In [1]:
from google.colab import drive
drive.mount('/content/drive/')

import os
os.chdir('/content/drive/My Drive/Colab Notebooks/refine-epitope-deep-learning')

Mounted at /content/drive/


In [2]:
!pip install transformers
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 66.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

# Preprocess data 

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [4]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [5]:
def preprocess_data(data):

    # Preprocess data
    X = list(data["sequence"])
    y = list(data["label"])
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    return train_dataset, val_dataset

In [6]:
df_train = pd.read_csv("./input/data_train.csv")

sequence_formatted = []
for seq in df_train['sequence'].values:
  sequence_formatted.append(" ".join(seq))

data = pd.DataFrame({'sequence':sequence_formatted, 'label':df_train['label'].tolist()})

#data = df_train

data_op = data[:int(len(data)/2)]


# Define pretrained tokenizer and model
batch_size=8
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

train_dataset_op, val_dataset_op = preprocess_data(data_op)
train_dataset, val_dataset = preprocess_data(data)

# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    f"{model_name}-finetuned-classification",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    optim="adamw_torch"
)

def model_init():
    return model

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset_op,
    eval_dataset=val_dataset_op,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")



Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.546304,0.614458,0.415094,0.956522,0.578947
2,No log,0.359152,0.843373,0.678571,0.826087,0.745098
3,No log,0.360683,0.831325,0.695652,0.695652,0.695652
4,No log,0.368075,0.843373,0.692308,0.782609,0.734694
5,No log,0.369946,0.843373,0.708333,0.73913,0.723404


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-0/checkpoint-42
Configuration saved in bert-base-uncased-finetuned-classification/run-0/checkpoint-42/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-0/checkpoint-42/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-classification/run-0/checkpoint-42/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-classification/run-0/checkpoint-42/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-0/checkpoint-84
Configuration saved in bert-base-uncased-finetuned-classification/run-0/checkpoint-84/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-0/checkpoint-84/pytorch_model.bin
tokenizer config file saved in bert-base-uncased

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.368284,0.831325,0.68,0.73913,0.708333


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-1/checkpoint-42
Configuration saved in bert-base-uncased-finetuned-classification/run-1/checkpoint-42/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-1/checkpoint-42/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-classification/run-1/checkpoint-42/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-classification/run-1/checkpoint-42/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-base-uncased-finetuned-classification/run-1/checkpoint-42 (score: 0.7083333333333334).
[32m[I 2022-07-06 21:50:28,001][0m Trial 1 finished with value: 2.9587890693207615 and parameters: {'learning_rate': 5.598627372287537e-06, 'num_train_epochs': 1, 'seed': 21, 'per_device_train_batch_size'

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.364647,0.843373,0.708333,0.73913,0.723404


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-2/checkpoint-42
Configuration saved in bert-base-uncased-finetuned-classification/run-2/checkpoint-42/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-2/checkpoint-42/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-classification/run-2/checkpoint-42/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-classification/run-2/checkpoint-42/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from bert-base-uncased-finetuned-classification/run-2/checkpoint-42 (score: 0.723404255319149).
[32m[I 2022-07-06 21:50:52,166][0m Trial 2 finished with value: 3.0142415174109947 and parameters: {'learning_rate': 6.157575481226498e-06, 'num_train_epochs': 1, 'seed': 1, 'per_device_train_batch_size': 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.36866,0.86747,0.7,0.913043,0.792453
2,No log,0.390878,0.843373,0.708333,0.73913,0.723404
3,No log,0.388486,0.843373,0.708333,0.73913,0.723404
4,No log,0.372493,0.843373,0.708333,0.73913,0.723404


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-3/checkpoint-42
Configuration saved in bert-base-uncased-finetuned-classification/run-3/checkpoint-42/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-3/checkpoint-42/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-classification/run-3/checkpoint-42/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-classification/run-3/checkpoint-42/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-3/checkpoint-84
Configuration saved in bert-base-uncased-finetuned-classification/run-3/checkpoint-84/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-3/checkpoint-84/pytorch_model.bin
tokenizer config file saved in bert-base-uncased

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.366602,0.843373,0.666667,0.869565,0.754717
2,No log,0.365615,0.86747,0.772727,0.73913,0.755556


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-4/checkpoint-42
Configuration saved in bert-base-uncased-finetuned-classification/run-4/checkpoint-42/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-4/checkpoint-42/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-classification/run-4/checkpoint-42/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-classification/run-4/checkpoint-42/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/run-4/checkpoint-84
Configuration saved in bert-base-uncased-finetuned-classification/run-4/checkpoint-84/config.json
Model weights saved in bert-base-uncased-finetuned-classification/run-4/checkpoint-84/pytorch_model.bin
tokenizer config file saved in bert-base-uncased

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.449832,0.807229,0.652174,0.652174,0.652174


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
[32m[I 2022-07-06 21:53:34,062][0m Trial 5 pruned. [0m
Trial:
***** Running training *****
  Num examples = 330
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 168


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.558187,0.662651,0.44898,0.956522,0.611111


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
[32m[I 2022-07-06 21:53:51,299][0m Trial 6 pruned. [0m
Trial:
***** Running training *****
  Num examples = 330
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 42


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.556754,0.674699,0.458333,0.956522,0.619718


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
[32m[I 2022-07-06 21:54:08,532][0m Trial 7 pruned. [0m
Trial:
***** Running training *****
  Num examples = 330
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 126


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.448385,0.746988,0.52381,0.956522,0.676923


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
[32m[I 2022-07-06 21:54:25,724][0m Trial 8 pruned. [0m
Trial:
***** Running training *****
  Num examples = 330
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 84


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.38724,0.831325,0.68,0.73913,0.708333


***** Running Evaluation *****
  Num examples = 83
  Batch size = 8
[32m[I 2022-07-06 21:54:42,918][0m Trial 9 pruned. [0m


## Set the model with the best parameters and run it on the full dataset

In [7]:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train_dataset=train_dataset
trainer.eval_dataset=val_dataset

trainer.train()

***** Running training *****
  Num examples = 660
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 166


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.283431,0.86747,0.854167,0.732143,0.788462
2,No log,0.272089,0.879518,0.86,0.767857,0.811321


***** Running Evaluation *****
  Num examples = 166
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/checkpoint-83
Configuration saved in bert-base-uncased-finetuned-classification/checkpoint-83/config.json
Model weights saved in bert-base-uncased-finetuned-classification/checkpoint-83/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-classification/checkpoint-83/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-classification/checkpoint-83/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 166
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-classification/checkpoint-166
Configuration saved in bert-base-uncased-finetuned-classification/checkpoint-166/config.json
Model weights saved in bert-base-uncased-finetuned-classification/checkpoint-166/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-classification/checkpoint-166/to

TrainOutput(global_step=166, training_loss=0.32443432635571584, metrics={'train_runtime': 91.9071, 'train_samples_per_second': 14.362, 'train_steps_per_second': 1.806, 'total_flos': 655948194343200.0, 'train_loss': 0.32443432635571584, 'epoch': 2.0})

In [8]:
# ----- 3. Predict -----#
# Load test data
#test_data = pd.read_csv("test.csv")
test = pd.read_csv("./input/data_test.csv")

sequence_formatted = []
for seq in test['sequence'].values:
  sequence_formatted.append(" ".join(seq))

test_data = pd.DataFrame({'sequence':sequence_formatted, 'label':test['label'].tolist()})


X_test = list(test_data["sequence"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)


***** Running Prediction *****
  Num examples = 207
  Batch size = 8


In [9]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score

print("ROC_AUC:", roc_auc_score(test_data['label'], y_pred))

print(classification_report(test_data['label'], y_pred))

ROC_AUC: 0.8216195569136746
              precision    recall  f1-score   support

           0       0.82      0.92      0.87       119
           1       0.86      0.73      0.79        88

    accuracy                           0.84       207
   macro avg       0.84      0.82      0.83       207
weighted avg       0.84      0.84      0.83       207

