# Requirements

### Download

In [1]:
!pip install transformers datasets accelerate nvidia-ml-py3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.15.0-py3-none-any.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.5/191.5 KB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Libraries

In [2]:
#basics
import pandas as pd
import numpy as np
import gc
from numba import cuda

#Torch
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#Split data
from sklearn.model_selection import train_test_split

#metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

#Bert
from transformers import BertModel, BertTokenizer, TrainingArguments, Trainer, logging

In [26]:
# Disabled GPU-reset snippet: the statements below are wrapped in a bare string
# literal, so none of them execute; the cell only echoes the string back.
# `cuda` here is `numba.cuda` from the imports cell.
"""gc.collect()

torch.cuda.empty_cache()

cuda.select_device(0)
cuda.close()
cuda.select_device(0)"""

'gc.collect()\n\ntorch.cuda.empty_cache()\n\ncuda.select_device(0)\ncuda.close()\ncuda.select_device(0)'

### Paths, device and GPU helpers

In [3]:
# Absolute locations of the sequence file and of the label file.
sequences_path = "/content/drive/MyDrive/sequences.txt"
graph_labels_path = "/content/drive/MyDrive/graph_labels.txt"

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
from pynvml import *


def print_gpu_utilization():
    """Print the memory currently in use on GPU index 0, in whole MB (bytes // 1024**2)."""
    nvmlInit()
    gpu0 = nvmlDeviceGetHandleByIndex(0)
    used_mb = nvmlDeviceGetMemoryInfo(gpu0).used // 1024**2
    print(f"GPU memory occupied: {used_mb} MB.")


def print_summary(result):
    """Print runtime and throughput taken from ``result.metrics``, then GPU memory use.

    ``result`` must expose a ``metrics`` mapping with the keys
    ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    metrics = result.metrics
    runtime = metrics['train_runtime']
    throughput = metrics['train_samples_per_second']
    print(f"Time: {runtime:.2f}")
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()

# Get Data

### Load data

In [5]:
def read_data_sequence(sequences_file=None, labels_file=None):
  """Read the sequences and split them into labelled and unlabelled sets.

  Line i of the sequences file is paired with line i of the labels file.
  Each labels line has the form ``<name>,<label>``: rows whose label field
  is empty go to the test set, all other rows go to the training set.

  Args:
    sequences_file: path of the sequences file; defaults to the
      module-level ``sequences_path``.
    labels_file: path of the labels file; defaults to the module-level
      ``graph_labels_path``.

  Returns:
    ``(sequences_train, sequences_test, proteins_test, y_train)`` where
    ``proteins_test`` holds the first field of every unlabelled row and
    ``y_train`` holds the integer labels of the labelled rows.

  Raises:
    IndexError: if a labels line has no comma, or the labels file has more
      lines than the sequences file.
    ValueError: if a non-empty label is not an integer.
  """
  if sequences_file is None:
      sequences_file = sequences_path
  if labels_file is None:
      labels_file = graph_labels_path

  # Read sequences. Strip only the line terminator: slicing off the last
  # character would drop a residue when the file has no trailing newline.
  with open(sequences_file, "r") as f:
      sequences = [line.rstrip("\r\n") for line in f]

  # Split data into training and test sets
  sequences_train = list()
  sequences_test = list()
  proteins_test = list()
  y_train = list()
  with open(labels_file, "r") as f:
      for i, line in enumerate(f):
          t = line.rstrip("\r\n").split(",")
          label = t[1]
          if len(label) == 0:
              # No label: this row belongs to the test set.
              proteins_test.append(t[0])
              sequences_test.append(sequences[i])
          else:
              sequences_train.append(sequences[i])
              y_train.append(int(label))
  return sequences_train, sequences_test, proteins_test, y_train
# Load the data once: labelled sequences with their integer labels, plus the
# unlabelled sequences and the first column of their rows in the labels file.
sequences_train, sequences_test, proteins_test, y_train = read_data_sequence()

In [6]:
# Put one space between consecutive residues ("MKT" -> "M K T").
# Existing spaces are removed first so that re-running this cell leaves
# already-spaced sequences unchanged instead of widening the gaps.
sequences_train = [" ".join(seq.replace(" ", "")) for seq in sequences_train]
sequences_test = [" ".join(seq.replace(" ", "")) for seq in sequences_test]

# Preprocessing

### Split Data

In [7]:
test_size = .25
# Fixed seed so the shuffled train/validation partition is identical on every run.
split_seed = 42
# NOTE: `y_train` is overwritten with the training part of the labels, so this
# cell cannot be re-run on its own; re-run the data-loading cell first.
X_train, X_valid, y_train, y_valid = train_test_split(
    sequences_train,
    y_train,
    shuffle=True,
    test_size=test_size,
    random_state=split_seed,
)

### Create Dataset and Dataloader

In [8]:
# Disabled alternative: 90th percentile of the sequence string lengths
# (at this point the strings already contain the separating spaces).
#max_len = np.quantile([len(x) for x in sequences_train],.9)
# Fixed cap, used below to shorten the sequences and, through the datasets,
# as the tokenizer's `max_length`.
max_len = 100

In [9]:
# Cap each sequence at `max_len` residues. The sequences are space-separated,
# so slicing characters (`x[:max_len]`) kept only about max_len / 2 residues
# and left the rest of the tokenizer window as padding; slice the residues.
X_train = [" ".join(x.split()[:max_len]) for x in X_train]
X_valid = [" ".join(x.split()[:max_len]) for x in X_valid]

In [10]:
# Checkpoint that provides the tokenizer; input casing is left untouched.
tokenizer_bert = "Rostlab/prot_bert"
tokenizer = BertTokenizer.from_pretrained(
    tokenizer_bert,
    do_lower_case=False,
)

Downloading:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/361 [00:00<?, ?B/s]

In [11]:
class ProteinSequenceDataset(Dataset):
    """Dataset that tokenizes one sequence per item and pairs it with its target.

    Args:
        sequence: indexable collection of sequences (each is passed through ``str``).
        targets: indexable collection of targets, aligned with ``sequence``.
        tokenizer: object exposing ``encode_plus``.
        max_len: length every encoding is truncated or padded to.
    """

    def __init__(self, sequence, targets, tokenizer, max_len):
        self.sequence, self.targets = sequence, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of sequences held by the dataset."""
        return len(self.sequence)

    def __getitem__(self, item):
        """Return the raw text, flattened ids/mask and the target as a long tensor."""
        text = str(self.sequence[item])
        label = self.targets[item]

        # Fixed-length encoding: truncate or pad to `max_len`, PyTorch tensors out.
        encode_options = dict(
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'protein_sequence': text}
        # Flatten each field to drop the leading dimension of the encoding.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample


In [12]:
batch_size = 4

# Wrap each split in the tokenizing dataset.
training_dataset = ProteinSequenceDataset(X_train, y_train, tokenizer, max_len)
valid_dataset = ProteinSequenceDataset(X_valid, y_valid, tokenizer, max_len)

# Both loaders share the same batch size and both shuffle their split.
loader_options = dict(batch_size=batch_size, shuffle=True)
training_dataloader = DataLoader(training_dataset, **loader_options)
valid_dataloader = DataLoader(valid_dataset, **loader_options)

## Build Model and metrics

In [13]:
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 from raw scores and labels.

    Args:
        p: a ``(predictions, labels)`` pair. ``predictions`` holds one row of
            class scores per example (the arg-max over axis 1 is the predicted
            class); ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    # scikit-learn's default `average='binary'` raises as soon as more than two
    # classes appear. Keep it for two-class data and fall back to macro
    # averaging otherwise, so multiclass evaluation does not crash.
    seen_classes = set(np.asarray(labels).ravel().tolist()) | set(pred.tolist())
    average = "binary" if len(seen_classes) <= 2 else "macro"

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average=average)
    precision = precision_score(y_true=labels, y_pred=pred, average=average)
    f1 = f1_score(y_true=labels, y_pred=pred, average=average)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [14]:
# Checkpoint name handed to `ProteinClassifier`, which loads it with
# `BertModel.from_pretrained`.
# NOTE(review): the tokenizer is loaded from a different checkpoint
# ("Rostlab/prot_bert") - confirm that both share the same vocabulary.
model_bert = "Rostlab/prot_bert_bfd_localization"

### Fine-tune BERT

In [15]:
class ProteinClassifier(nn.Module):
    """BERT encoder followed by a dropout + linear + tanh classification head.

    Args:
        model_bert: name or path of the checkpoint loaded with
            ``BertModel.from_pretrained``.
        n_classes: number of output scores produced per sequence.
        dropout: dropout probability applied to the pooled BERT output.

    Both sub-modules are moved to the module-level ``device`` on construction.
    """

    def __init__(self, model_bert, n_classes, dropout):
        super(ProteinClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_bert).to(device)
        # NOTE(review): the final Tanh bounds every class score to [-1, 1]
        # before it is used as a logit - confirm this is intended.
        self.classifier = nn.Sequential(nn.Dropout(p=dropout),
                                        nn.Linear(self.bert.config.hidden_size, n_classes),
                                        nn.Tanh()).to(device)

    def forward(self, input_ids, attention_mask, targets=None, labels=None):
        """Score a batch and, when class ids are supplied, also compute the loss.

        Args:
            input_ids: token ids passed to BERT.
            attention_mask: attention mask passed to BERT.
            targets: optional class ids, under the key the dataset emits.
            labels: optional class ids, under the conventional key; takes
                precedence over ``targets`` when both are given.

        Returns:
            The class-score tensor when no class ids are given (the original
            behaviour). Otherwise ``{"loss": ..., "logits": ...}`` with the
            cross-entropy loss, so that a training loop which forwards the
            whole batch as keyword arguments (such as ``Trainer``) gets a loss.
        """
        pooled = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        ).pooler_output
        hat_y = self.classifier(pooled)

        gold = labels if labels is not None else targets
        if gold is None:
            return hat_y

        loss = nn.functional.cross_entropy(hat_y, gold)
        return {"loss": loss, "logits": hat_y}

In [16]:
#Define model
dropout = .2
# Count the classes over both splits: a class that happened to land only in
# the validation split must still get an output unit.
# (Assumes the class ids are 0..K-1 - TODO confirm against the labels file.)
n_class = len(set(y_train) | set(y_valid))
#model
model = ProteinClassifier(model_bert, n_class, dropout).to(device)

# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    # Mixed precision needs a CUDA device; only enable it when one is present
    # so the CPU fallback chosen for `device` keeps working.
    fp16=torch.cuda.is_available(),
    # The dataset stores the class ids under "targets", not the default "labels".
    label_names=["targets"],
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=training_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of the model checkpoint at Rostlab/prot_bert_bfd_localization were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using cuda_amp half precision backend


In [17]:
# Ask PyTorch to release the GPU memory it is caching but not currently using.
torch.cuda.empty_cache()

In [18]:
# Memory in use on GPU 0 once the model and the Trainer have been created.
print_gpu_utilization()

GPU memory occupied: 2388 MB.


In [42]:
# Driver-level view of the GPU. This cell carries execution count 42, i.e. it
# was run out of order relative to the cells around it.
!nvidia-smi

Wed Jan 18 08:55:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    41W /  70W |   4300MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [19]:
# Restrict the transformers logger to errors, then start fine-tuning.
logging.set_verbosity_error()
trainer.train()
# Disabled summary: `result` is never assigned in this notebook, so the return
# value of `trainer.train()` must be captured before enabling the next line.
#print_summary(result)



RuntimeError: ignored