In [None]:
# !pip install git+https://github.com/adapter-hub/adapters.git
# !pip install wandb
# !pip install pandas
# !pip install datasets

# requires ipykernel package

In [None]:
# !pip install accelerate -U

In [11]:
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel, Seq2SeqTrainingArguments, BertTokenizer, Seq2SeqTrainer, AutoModel, AutoModelForCausalLM, DataCollatorForSeq2Seq, GenerationConfig, DataCollatorWithPadding
from adapters import BnConfig, Seq2SeqAdapterTrainer, AdapterTrainer, BertAdapterModel, init
import wandb
import torch
import pandas as pd
from datasets import Dataset
import os
import datasets
import numpy as np
import re

In [2]:
# print device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


In [8]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained("Exscientia/IgBert", "Exscientia/IgBert", add_cross_attention=True)
init(model)

Some weights of BertModel were not initialized from the model checkpoint at Exscientia/IgBert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at Exscientia/IgBert and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder

In [9]:
config = BnConfig(mh_adapter=True, output_adapter=True, reduction_factor=16, non_linearity="relu")

model.add_adapter("seq2seq_adapter", config=config)
model.set_active_adapters("seq2seq_adapter")
model.train_adapter("seq2seq_adapter")


In [6]:
model.named_parameters

<bound method Module.named_parameters of EncoderDecoderModelWithAdapters(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30, 1024, padding_idx=0)
      (position_embeddings): Embedding(40000, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-29): 30 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttentionWithAdapters(
              (query): LoRALinearTorch(
                in_features=1024, out_features=1024, bias=True
                (loras): ModuleDict()
              )
              (key): LoRALinearTorch(
                in_features=1024, out_features=1024, bias=True
                (loras): ModuleDict()
              )
              (value): LoRALinearTorch(
                in_features=1024, out_features=1024

In [12]:
# Loop through all parameters and enable gradient computation only for 'crossattention' parameters
for name, param in model.named_parameters():
    if re.match(".*crossattention.*", name):
        param.requires_grad = True
    else:
        param.requires_grad = False  # Assuming you want to freeze other parameters

# Your model is now set up to train only the cross-attention layers and the added adapter.

In [13]:
#print(f"print EncoderDecoderModel: {model}")

# Load the tokenizer and model from Hugging Face
tokenizer = BertTokenizer.from_pretrained("Exscientia/IgBert")

In [35]:
generation_config = GenerationConfig(
    num_return_sequences=1,
    max_length=512,
    min_length=50,
    early_stopping = True,
    
    length_penalty = -2.0,
    
    num_beams = 3,

    # sampling
    do_sample=True,
    top_k=50,
    
    no_repeat_ngram_size = 2,

    # distribution adjustment
    temperature=0.001,
    repetition_penalty=1,

    vocab_size=model.config.encoder.vocab_size,

    # token ids
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.sep_token_id,
    decoder_start_token_id=tokenizer.cls_token_id,

    # others
    use_cache=True,
    output_logits=True,
    output_scores=True,
    output_hidden_states=True,
    return_dict_in_generate=True, )


In [36]:
generation_config.save_pretrained("generation_config", "generation_config_5.json")

In [37]:
generation_config_name = "generation_config_5"
generation_config = GenerationConfig.from_pretrained("generation_config", f"{generation_config_name}.json")

In [39]:
batch_size = 32
num_train_epochs = 3
learning_rate = 1e-4


# Set up the run name
run_name=f"freeze_small_data_with_adapters_batch_size_{batch_size}_epochs_{num_train_epochs}_automodel_lr_{learning_rate}_{generation_config_name}"

output_dir = f"./{run_name}"
logging_dir = f"./{run_name}_logging"

In [40]:
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    report_to="wandb",
    run_name=run_name,
    generation_config=generation_config,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Create directories if they do not exist
os.makedirs(training_args.output_dir, exist_ok=True)
os.makedirs(training_args.logging_dir, exist_ok=True)

# Log in to Weights & Biases
#wandb.login()


wandb.init(project="bert2bert-translation", name=run_name)

In [41]:
def load_data(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(line.strip())

    sequences = []
    for entry in data:
        split_entry = entry.split(' [SEP] ')
        if (len(split_entry) == 2):
            sequences.append(split_entry)
        else:
            print(f"Skipping invalid entry: {entry}")

    df = pd.DataFrame(sequences, columns=['heavy', 'light'])
    return df



In [None]:
!pip show datasets


In [42]:
# Load training and validation data

train_file_path = '/ibmm_data2/oas_database/paired_lea_tmp/paired_model/BERT2BERT/data/paired_full_seqs_sep_train_no_ids_small_SPACE_separated.txt'
val_file_path = '/ibmm_data2/oas_database/paired_lea_tmp/paired_model/BERT2BERT/data/paired_full_seqs_sep_val_no_ids_small_SPACE_separated.txt'
#test_file_path = '/ibmm_data2/oas_database/paired_lea_tmp/paired_model/train_test_val_datasets/heavy_sep_light_seq/paired_full_seqs_sep_test_no_ids_space_separated_SMALL.txt'

In [43]:
train_df = load_data(train_file_path)
val_df = load_data(val_file_path)
#test_df = load_data(test_file_path)


encoder_max_length = 200
decoder_max_length = 200

def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer(batch["light"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["heavy"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    #batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()

    # Ignore PAD token in the labels
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

In [44]:
# Convert the dataframes to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['heavy', 'light']])
val_dataset = Dataset.from_pandas(val_df[['heavy', 'light']])
#test_dataset = Dataset.from_pandas(test_df[['heavy', 'light']])


train_data = train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
)

# "decoder_input_ids",
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_attention_mask", "labels"],
)

val_data = val_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
)

# "decoder_input_ids",
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_attention_mask", "labels"],
)


# test_data = test_dataset.map(
#     process_data_to_model_inputs,   
#     batched=True,
#     batch_size=batch_size,
# )   

# # "decoder_input_ids",
# test_data.set_format(
#     type="torch", columns=["input_ids", "attention_mask", "decoder_attention_mask", "labels"],
# )




# print heavy and light seq from the first example in the training data (train_dataset)
print(f"first example heavy and light seq {train_dataset[0]}, {train_dataset[1]}")


# Initialize the trainer
trainer = Seq2SeqAdapterTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    adapter_names=["seq2seq_adapter"],
)


Map: 100%|██████████| 200/200 [00:00<00:00, 847.23 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 821.70 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


first example heavy and light seq {'heavy': 'Q V Q L Q E S G P G L V K P S E T L S L T C T V S G G S I S G F Y W S W I R Q S P G K G L E W I A Y I Y F S G S T N Y N P S L K S R V T L S V D T S K N Q F S L K L S S V T A A D S A V Y Y C A R D V G P Y N S I S P G R Y Y F D Y W G P G T L V T V S S', 'light': 'Q S A L T Q P A S V S G S P G Q S I T I S C T G T S S D V G N Y N L V S W Y Q H H P G K A P K L M I Y E V S K R P S G I S N R F S G S K S G N T A S L T I S G L Q A D D E A D Y Y C C S Y A G S R I L Y V F G S G T K V T V L'}, {'heavy': 'Q V Q L Q E S G P G L V K P S E T L S L T C T V S G G S I S S Y H W S W I R Q P P G K G L E W I G Y M Y Y S G S T N Y N P S L K S R V T I S V D T S K T Q F S L K L S S V T T A D T A V Y Y C A R G R L I W S A D Y T G G D Y F D P W G Q G I L V T V S S', 'light': 'Q S A L T Q P A S V S G S P G Q S I T I S C T G S S S D V G S Y N L V S W Y Q Q H P G K A P K L M I Y E V S K R P S G V S N R F S G S K S G N T A S L T I S G L Q A E D E A Q Y Y C C S Y G G R N F

In [45]:
model.generation_config

GenerationConfig {
  "decoder_start_token_id": 2,
  "do_sample": true,
  "early_stopping": true,
  "eos_token_id": 3,
  "length_penalty": -2.0,
  "max_length": 512,
  "min_length": 50,
  "no_repeat_ngram_size": 2,
  "num_beams": 3,
  "output_hidden_states": true,
  "output_logits": true,
  "output_scores": true,
  "pad_token_id": 0,
  "return_dict_in_generate": true,
  "temperature": 0.001,
  "vocab_size": 30
}

In [46]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [47]:
#print(f"trainer.get_train_dataloader().collate_fn: {trainer.get_train_dataloader().collate_fn}")

# Train the model
trainer.train()
#trainer.evaluate()






OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/transformers/models/encoder_decoder/modeling_encoder_decoder.py", line 626, in forward
    decoder_outputs = self.decoder(
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 1201, in forward
    outputs = self.bert(
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/adapters/context.py", line 116, in wrapper_func
    results = f(self, *args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/adapters/model_mixin.py", line 1350, in forward
    return super().forward(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 988, in forward
    encoder_outputs = self.encoder(
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 582, in forward
    layer_outputs = layer_module(
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1582, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 498, in forward
    cross_attention_outputs = self.crossattention(
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 411, in forward
    attention_output = self.output(self_outputs[0], hidden_states)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/adapters/models/bert/modeling_bert.py", line 149, in forward
    hidden_states = self.bottleneck_layer_forward(hidden_states, input_tensor, self.LayerNorm)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/adapters/methods/bottleneck.py", line 369, in bottleneck_layer_forward
    hidden_states = last_adapter.post_forward(hidden_states, input_hidden_states, residual_input, layer_norm)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/adapters/methods/modeling.py", line 218, in post_forward
    hidden_states = layer_norm(hidden_states + input_tensor)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/modules/normalization.py", line 201, in forward
    return F.layer_norm(
  File "/home/leab/anaconda3/envs/adap_2/lib/python3.9/site-packages/torch/nn/functional.py", line 2573, in layer_norm
    return torch.layer_norm(input, normalized_shape, weight, bias, eps, torch.backends.cudnn.enabled)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
input_prompt = "S T G V A F M E I N G L R S D D T A T Y F C A I N R V G D R G S N P S Y F Q D W G Q G T R V T V S S "
print(f"input_prompt: {input_prompt}")

inputs = tokenizer(input_prompt, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

print(f"attention_mask: {attention_mask}")

#input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
print(f"input_ids: {input_ids}")

# Generate text using the model
generated_seq = model.generate(input_ids=input_ids, 
                               attention_mask=attention_mask, 
                               max_length=100, 
                               output_scores=True, 
                               return_dict_in_generate=True)

# Turn output scores to probabilities
# generated_seq_probs = torch.nn.functional.softmax(generated_seq['scores'][0], dim=-1)

# Access the first element in the generated sequence
sequence = generated_seq["sequences"][0]

# Print the generated sequences and probabilities
print(f"encoded heavy sequence: {sequence}.")

# Convert the generated IDs back to text
generated_text = tokenizer.decode(sequence, skip_special_tokens=True)

print("decoded heavy sequence: ", generated_text)

# print(test_data)

# Load your test data
test_file_path = '/kaggle/input/test-file/paired_full_seqs_sep_test_no_ids_space_separated_SMALL.txt'
test_df = load_data(test_file_path)


# extract the light sequences from test_df
light_sequences = test_df["light"]

print("light_sequences: ", light_sequences)
print(f"length of light sequences {len(light_sequences)}")

generated_heavy_seqs = []

# Iterate through each sequence in the test dataset
for i in range(50):
    inputs = tokenizer(light_sequences[i], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    generated_seq = model.generate(input_ids=input_ids, 
                               attention_mask=attention_mask, 
                               max_length=100, 
                               output_scores=True, 
                               return_dict_in_generate=True,
                                   generation_config=generation_config)
    
    # Access the first element in the generated sequence
    sequence = generated_seq["sequences"][0]

    # Print the generated sequences and probabilities
    print(f"encoded heavy sequence: {sequence}.")

    # Convert the generated IDs back to text
    generated_text = tokenizer.decode(sequence, skip_special_tokens=True)

    print("decoded heavy sequence: ", generated_text)

    generated_heavy_seqs.append(generated_text)


print("generated_heavy_seqs:")
# print each generated sequence on new line
for seq in generated_heavy_seqs:
    print(seq)