In [3]:
from google.colab import drive
drive.mount('/content/drive/')

import os
os.chdir('/content/drive/My Drive/Colab Notebooks/refine-epitope-deep-learning')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
!pip install transformers
!pip install optuna

In [5]:
from transformers import BertConfig, BertForPreTraining, BertTokenizerFast, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', padding=True, truncation=True, max_length=512)

#with open("clean.txt", 'r') as fp:
#    text = fp.read().split('\n')

In [6]:
config = BertConfig(    # https://huggingface.co/transformers/model_doc/bert.html#bertconfig
    #vocab_size=20000, # default It's an English standard, so you have to modify it to fit the vocab size you made.
    # hidden_size=512,
    # num_hidden_layers=12,    # layer num
    # num_attention_heads=8,    # transformer attention head number
    # intermediate_size=3072,   # transformer Dimension size of the feed-forward network within
    # hidden_act="gelu",
    # hidden_dropout_prob=0.1,
    # attention_probs_dropout_prob=0.1,
    #max_position_embeddings=512,    # embedding size Specify how many tokens to use as input
    # type_vocab_size=2,    # token type Range of ids (BERT is segmentA and segmentB, two types)
    # pad_token_id=0,
    # position_embedding_type="absolute"
)

model = BertForPreTraining(config=config)
model.num_parameters()

110106428

In [14]:
#bag_senteces = [item for sentence in text for item in sentence.split('.') if item != '']

In [7]:

with open("iedb_linear_epitopes.fasta", 'r') as fp:
    text = fp.read().split('\n')

list_positive_negative = []

for row in text:
  if ('PositiveID' not in row) and ('NegativeID' not in row): 
    list_positive_negative.append(row)


file = open("bert_iedb_linear_epitopes.txt", "w") 
for item in list_positive_negative:
  epitope= ''.join([c for c in item if c.isupper()])
  no_epitope = item.split(epitope)
  file.writelines(" ".join(no_epitope[0]) + '\n')
  file.writelines(" ".join(epitope) + '\n')
  file.writelines(" ".join(no_epitope[1]) + '\n\n')

file.close()


In [8]:
from transformers import TextDatasetForNextSentencePrediction
from transformers import DataCollatorForLanguageModeling

dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path='bert_iedb_linear_epitopes.txt',
    block_size=512,
    overwrite_cache=False,
  #  short_seq_probability=0.1,
    nsp_probability=0.5,
)

data_collator = DataCollatorForLanguageModeling(    # We don't have to implement [MASK]! :-)
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)



In [9]:
import torch

MAX_LEN=512

ending_sep_token_tensor = torch.tensor([102])

for count, sample in enumerate(dataset.examples):
    if len(sample['input_ids'])>MAX_LEN:
        dataset.examples[count]['input_ids'] = torch.cat((sample['input_ids'][:MAX_LEN-1], ending_sep_token_tensor), 0)
        dataset.examples[count]['token_type_ids'] = sample['token_type_ids'][:MAX_LEN]

In [18]:
#dataset.examples[0]

In [19]:
#tokenizer.decode(data_collator(dataset.examples)['input_ids'][0].tolist())

In [10]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir= "./model/bert-retrained",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size= 8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    optim="adamw_torch"
)



trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)


trainer.train()
trainer.save_model("./model/bertExtendend")

***** Running training *****
  Num examples = 49671
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 31045


Step,Training Loss
500,7.1805
1000,6.4967
1500,6.423
2000,6.3965
2500,6.4251
3000,6.4196
3500,6.4104
4000,6.3851
4500,6.4083
5000,6.4426


Saving model checkpoint to ./model/bert-retrained/checkpoint-10000
Configuration saved in ./model/bert-retrained/checkpoint-10000/config.json
Model weights saved in ./model/bert-retrained/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to ./model/bert-retrained/checkpoint-20000
Configuration saved in ./model/bert-retrained/checkpoint-20000/config.json
Model weights saved in ./model/bert-retrained/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to ./model/bert-retrained/checkpoint-30000
Configuration saved in ./model/bert-retrained/checkpoint-30000/config.json
Model weights saved in ./model/bert-retrained/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [model/bert-retrained/checkpoint-10000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./model/bertExtendend
Configuration saved in ./model/bertExtendend/config.json
Model weights saved in ./model/bertExtendend

## Loading pre-trained model

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [12]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [13]:
def preprocess_data(data):

    # Preprocess data
    X = list(data["sequence"])
    y = list(data["label"])
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    return train_dataset, val_dataset

In [14]:
df_train = pd.read_csv("./input/data_train.csv")

sequence_formatted = []
for seq in df_train['sequence'].values:
  sequence_formatted.append(" ".join(seq))

data = pd.DataFrame({'sequence':sequence_formatted, 'label':df_train['label'].tolist()})

#data = df_train

data_op = data[:int(len(data)/2)]


# Define pretrained tokenizer and model
batch_size=8
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained("./model/bertExtendend", num_labels=2)

train_dataset_op, val_dataset_op = preprocess_data(data_op)
train_dataset, val_dataset = preprocess_data(data)

# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    f"{model_name}-finetuned-classification",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    optim="adamw_torch"
)

def model_init():
    return model

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset_op,
    eval_dataset=val_dataset_op,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


#best_run = trainer.hyperparameter_search(n_trials=2, direction="maximize")


loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7

In [15]:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train_dataset=train_dataset
trainer.eval_dataset=val_dataset

trainer.train()

NameError: ignored

In [16]:
# ----- 3. Predict -----#
# Load test data
#test_data = pd.read_csv("test.csv")
test = pd.read_csv("./input/data_test.csv")

sequence_formatted = []
for seq in test['sequence'].values:
  sequence_formatted.append(" ".join(seq))

test_data = pd.DataFrame({'sequence':sequence_formatted, 'label':test['label'].tolist()})


X_test = list(test_data["sequence"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, _, _ = trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)


***** Running Prediction *****
  Num examples = 207
  Batch size = 8


In [17]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score

print("ROC_AUC:", roc_auc_score(test_data['label'], y_pred))

print(classification_report(test_data['label'], y_pred))

ROC_AUC: 0.5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       119
           1       0.43      1.00      0.60        88

    accuracy                           0.43       207
   macro avg       0.21      0.50      0.30       207
weighted avg       0.18      0.43      0.25       207



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
