In [25]:
from google.colab import drive
drive.mount('/content/drive/')

import os
os.chdir('/content/drive/My Drive/Colab Notebooks/refine-epitope-deep-learning')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [26]:
!pip install transformers
!pip install optuna
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Preprocess data 

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
#from transformers import EarlyStoppingCallback
from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [28]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [29]:
def preprocess_data(data):

    # Preprocess data
    X = list(data["sequence"])
    y = list(data["label"])
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    return train_dataset, val_dataset

In [30]:
df_train = pd.read_csv("./input/data_train.csv")

sequence_formatted = []
for seq in df_train['sequence'].values:
  sequence_formatted.append(" ".join(seq))

data = pd.DataFrame({'sequence':sequence_formatted, 'label':df_train['label'].tolist()})

#data = df_train

data_op = data[:int(len(data)/5)]


# Define pretrained tokenizer and model
batch_size=8
#model_name = "cardiffnlp/twitter-roberta-base-emotion"
model_name = 'roberta-base' #'roberta-large' , 'roberta-large-mnli'

tokenizer = RobertaTokenizer.from_pretrained(model_name)

model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)


train_dataset_op, val_dataset_op = preprocess_data(data_op)
train_dataset, val_dataset = preprocess_data(data)

# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    f"{model_name}-finetuned-classification",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    
    #evaluation_strategy ='steps',
    #eval_steps = 50, # Evaluation and Save happens every 50 steps
    #save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    optim="adamw_torch"
)

def model_init():
    return model

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset_op,
    eval_dataset=val_dataset_op,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
  #  callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)


best_run = trainer.hyperparameter_search(n_trials=5, direction="maximize")



loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.583367,0.515152,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to roberta-base-finetuned-classification/run-0/checkpoint-17
Configuration saved in roberta-base-finetuned-classification/run-0/checkpoint-17/config.json
Model weights saved in roberta-base-finetuned-classification/run-0/checkpoint-17/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/run-0/checkpoint-17/tokenizer_config.json
Special tokens file saved in roberta-base-finetuned-classification/run-0/checkpoint-17/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from roberta-base-finetuned-classification/run-0/checkpoint-17 (score: 0.0).
[32m[I 2022-06-21 01:52:07,064][0m Trial 0 finished with value: 0.5151515151515151 and parameters: {'learning_rate': 5.7179973095939476e-05, 'num_train_epochs': 1, 'seed': 17, 'per_device_train_

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.556034,0.515152,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to roberta-base-finetuned-classification/run-1/checkpoint-17
Configuration saved in roberta-base-finetuned-classification/run-1/checkpoint-17/config.json
Model weights saved in roberta-base-finetuned-classification/run-1/checkpoint-17/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/run-1/checkpoint-17/tokenizer_config.json
Special tokens file saved in roberta-base-finetuned-classification/run-1/checkpoint-17/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from roberta-base-finetuned-classification/run-1/checkpoint-17 (score: 0.0).
[32m[I 2022-06-21 01:52:29,459][0m Trial 1 finished with value: 0.5151515151515151 and parameters: {'learning_rate': 2.875699826973366e-06, 'num_train_epochs': 1, 'seed': 9, 'per_device_train_ba

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.539043,0.515152,0.0,0.0,0.0
2,No log,0.561637,0.515152,0.0,0.0,0.0
3,No log,0.558612,0.515152,0.0,0.0,0.0
4,No log,0.559338,0.515152,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to roberta-base-finetuned-classification/run-2/checkpoint-17
Configuration saved in roberta-base-finetuned-classification/run-2/checkpoint-17/config.json
Model weights saved in roberta-base-finetuned-classification/run-2/checkpoint-17/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/run-2/checkpoint-17/tokenizer_config.json
Special tokens file saved in roberta-base-finetuned-classification/run-2/checkpoint-17/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to roberta-base-finetuned-classification/run-2/checkpoint-34
Configuration saved in roberta-base-finetuned-classification/run-2/checkpoint-34/config.json
Model weights saved in roberta-base-finetuned-classification/run-2/checkpoint

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.407955,0.848485,0.789474,0.9375,0.857143


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
Saving model checkpoint to roberta-base-finetuned-classification/run-3/checkpoint-17
Configuration saved in roberta-base-finetuned-classification/run-3/checkpoint-17/config.json
Model weights saved in roberta-base-finetuned-classification/run-3/checkpoint-17/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/run-3/checkpoint-17/tokenizer_config.json
Special tokens file saved in roberta-base-finetuned-classification/run-3/checkpoint-17/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from roberta-base-finetuned-classification/run-3/checkpoint-17 (score: 0.8571428571428572).
[32m[I 2022-06-21 01:54:01,611][0m Trial 3 finished with value: 3.432601389838232 and parameters: {'learning_rate': 2.84776956713175e-05, 'num_train_epochs': 1, 'seed': 6, 'per_device_train_batch_size': 8}. Best is trial 3 with value:

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.523114,0.878788,0.928571,0.8125,0.866667
2,No log,0.463035,0.848485,0.923077,0.75,0.827586
3,No log,0.682931,0.757576,0.9,0.5625,0.692308
4,No log,0.435951,0.818182,0.916667,0.6875,0.785714
5,No log,0.254751,0.878788,0.928571,0.8125,0.866667


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
Saving model checkpoint to roberta-base-finetuned-classification/run-4/checkpoint-17
Configuration saved in roberta-base-finetuned-classification/run-4/checkpoint-17/config.json
Model weights saved in roberta-base-finetuned-classification/run-4/checkpoint-17/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/run-4/checkpoint-17/tokenizer_config.json
Special tokens file saved in roberta-base-finetuned-classification/run-4/checkpoint-17/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
Saving model checkpoint to roberta-base-finetuned-classification/run-4/checkpoint-34
Configuration saved in roberta-base-finetuned-classification/run-4/checkpoint-34/config.json
Model weights saved in roberta-base-finetuned-classification/run-4/checkpoint-34/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/run-4/checkpoint-34

## Set the model with the best parameters and run it on the full dataset

In [31]:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train_dataset=train_dataset
trainer.eval_dataset=val_dataset

trainer.train()

***** Running training *****
  Num examples = 660
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 415


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.444188,0.843373,0.882353,0.692308,0.775862
2,No log,0.393227,0.825301,0.8,0.738462,0.768
3,No log,0.40881,0.825301,0.790323,0.753846,0.771654
4,No log,0.513111,0.837349,0.851852,0.707692,0.773109
5,No log,0.457833,0.837349,0.827586,0.738462,0.780488


***** Running Evaluation *****
  Num examples = 166
  Batch size = 8
Saving model checkpoint to roberta-base-finetuned-classification/checkpoint-83
Configuration saved in roberta-base-finetuned-classification/checkpoint-83/config.json
Model weights saved in roberta-base-finetuned-classification/checkpoint-83/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/checkpoint-83/tokenizer_config.json
Special tokens file saved in roberta-base-finetuned-classification/checkpoint-83/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 166
  Batch size = 8
Saving model checkpoint to roberta-base-finetuned-classification/checkpoint-166
Configuration saved in roberta-base-finetuned-classification/checkpoint-166/config.json
Model weights saved in roberta-base-finetuned-classification/checkpoint-166/pytorch_model.bin
tokenizer config file saved in roberta-base-finetuned-classification/checkpoint-166/tokenizer_config.json
Special tokens file saved

TrainOutput(global_step=415, training_loss=0.3547350780073419, metrics={'train_runtime': 223.8652, 'train_samples_per_second': 14.741, 'train_steps_per_second': 1.854, 'total_flos': 741079009638000.0, 'train_loss': 0.3547350780073419, 'epoch': 5.0})

In [32]:
# ----- 3. Predict -----#
# Load test data
#test_data = pd.read_csv("test.csv")
test = pd.read_csv("./input/data_test.csv")

sequence_formatted = []
for seq in test['sequence'].values:
  sequence_formatted.append(" ".join(seq))

test_data = pd.DataFrame({'sequence':sequence_formatted, 'label':test['label'].tolist()})


X_test = list(test_data["sequence"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 207
  Batch size = 8


In [33]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score

print("ROC_AUC:", roc_auc_score(test_data['label'], y_pred))

print(classification_report(test_data['label'], y_pred))

ROC_AUC: 0.8443468296409472
              precision    recall  f1-score   support

           0       0.84      0.92      0.88       119
           1       0.87      0.77      0.82        88

    accuracy                           0.86       207
   macro avg       0.86      0.84      0.85       207
weighted avg       0.86      0.86      0.85       207

