In [None]:
from datasets import load_dataset
# Load the first 500 rows of the "en-fr" configuration of opus_books (train split).
books = load_dataset("opus_books", "en-fr", split='train[:500]')
# Print one record's 'translation' field to inspect its layout.
print(books[0]['translation'])

In [None]:
def english_batch_iterator(samples=None):
    """Yield the English sentence of every translation record, one string at a time.

    Args:
        samples: Optional iterable of records shaped like
            ``{'translation': {'en': ..., 'fr': ...}}``. When omitted, the
            notebook-level ``books`` dataset is iterated, which is the
            original behaviour.

    Yields:
        The ``'en'`` entry of each record's ``'translation'`` mapping.
    """
    # The fallback is resolved when iteration starts, so the function can be
    # defined (and exercised) without the global dataset being loaded.
    source = books if samples is None else samples
    for record in source:
        yield record['translation']['en']

def french_batch_iterator(samples=None):
    """Yield the French sentence of every translation record, one string at a time.

    Args:
        samples: Optional iterable of records shaped like
            ``{'translation': {'en': ..., 'fr': ...}}``. When omitted, the
            notebook-level ``books`` dataset is iterated, which is the
            original behaviour.

    Yields:
        The ``'fr'`` entry of each record's ``'translation'`` mapping.
    """
    # The fallback is resolved when iteration starts, so the function can be
    # defined (and exercised) without the global dataset being loaded.
    source = books if samples is None else samples
    for record in source:
        yield record['translation']['fr']

In [None]:
from src.utils.tokenizer import BpeTokenizer

# Fit a BpeTokenizer on the English sentences yielded by english_batch_iterator().
english_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
english_tokenizer.tokenizer.train_from_iterator(english_batch_iterator(), trainer=english_tokenizer.trainer)
english_tokenizer.add_unk_id()
# NOTE(review): the file-name suffix is formatted from (1000, 2048) while the
# tokenizer above is built with seq_size=128 — confirm which value the name should carry.
english_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "en_{}_{}".format(1000, 2048))


In [None]:
# Fit a BpeTokenizer on the French sentences yielded by french_batch_iterator().
french_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
french_tokenizer.tokenizer.train_from_iterator(french_batch_iterator(), trainer=french_tokenizer.trainer)
french_tokenizer.add_unk_id()
# NOTE(review): the file-name suffix is formatted from (1000, 2048) while the
# tokenizer above is built with seq_size=128 — confirm which value the name should carry.
french_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "fr_{}_{}".format(1000, 2048))

# Separator


In [None]:
%load_ext autoreload
%autoreload 2
import argparse
import os
import wandb
from datetime import datetime
import pytz
import time
import json

# Restrict the devices visible to this process; assigned before the torch import further down.
# NOTE(review): parse_opt selects 'cuda:0', which presumably then refers to the single
# device exposed here — confirm this is the intended GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from src.models import get_model
from src.utils.helpers import set_hyps
from src.utils.tokenizer import get_tokenizer
from src.data import get_datasets
from train import train

from datasets import load_dataset

import torch


def parse_opt(argv=()):
    """Build the run configuration from command-line style options.

    Args:
        argv: Sequence of option strings to parse. Defaults to an empty
            sequence, so inside a notebook every option takes its default and
            the kernel's own ``sys.argv`` is ignored. Pass ``None`` to parse
            the real command line when running as a script.

    Returns:
        argparse.Namespace: the parsed options plus a ``device`` attribute,
        ``'cuda:0'`` when CUDA is available and ``'cpu'`` otherwise.
    """

    def _str2bool(value):
        # ``type=bool`` applies bool() to the raw string, so any non-empty
        # value -- including "False" -- used to be parsed as True.
        if isinstance(value, bool):
            return value
        text = value.strip().lower()
        if text in ('true', 't', 'yes', 'y', '1'):
            return True
        if text in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError("expected a boolean value, got {!r}".format(value))

    ################################################################ Arguments

    parser = argparse.ArgumentParser(description='Multilingual RNA Implementation')

    # Training Configuration
    parser.add_argument('--train-data', default="/data6/sobhan/rllm/dataset/rpm/test_rpm.txt", type=str, help='Fasta File Path')
    parser.add_argument('--eval-data', default="/data6/sobhan/rllm/dataset/rpm/eval.txt", type=str, help='Fasta File Path')
    parser.add_argument('--sanity_check', default=False, type=_str2bool, help='Sanity Check the Implementation')

    parser.add_argument('--train-hyp', default="/data6/sobhan/rllm/hyps/train.yaml", type=str, help='Training Arguments hyperprameters')
    parser.add_argument('--model-hyp', default="/data6/sobhan/rllm/hyps/bart.yaml", type=str, help='Model hyperprameters')

    # utils
    parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
    parser.add_argument('--results-dir', default='./results', type=str, metavar='PATH', help='path to cache (default: none)')

    args = parser.parse_args(argv)

    args.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return args

In [None]:
# Parse the options, stamp the start time, merge the YAML hyper-parameters into
# `args`, and create a fresh results directory (with a plots sub-directory).
args = parse_opt()
print("============================================================================================")
# track total training time
start_time = datetime.now(pytz.timezone('Turkey')).strftime("%Y-%m-%d %H:%M")
args.start_time = start_time

print("Started training at : ", start_time)
print("============================================================================================")

# Handle Training Arguments
args = set_hyps(args.train_hyp, args)
args = set_hyps(args.model_hyp, args)
# `args.model` is not defined by parse_opt — presumably supplied by set_hyps; verify.
args.results_dir = os.path.join(args.results_dir, args.model)
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(args.results_dir, exist_ok=True)
# The run folder is numbered by how many entries the model folder already holds.
args.results_dir = os.path.join(args.results_dir, "run"+str(len(os.listdir(args.results_dir)))+"_"+time.strftime("%Y%m%d-%H%M%S"))
# No exist_ok here on purpose: a name collision with an existing run should fail loudly.
os.makedirs(args.results_dir)

plots_dir = os.path.join(args.results_dir, 'plots')
os.mkdir(plots_dir)

In [None]:
# Read args.train_data with the "text" loader and keep the first 100 rows of its train split.
dataset = load_dataset("text", data_files=args.train_data, split="train[:100]")

In [None]:
def protein_batch_iterator(samples=None):
    """Yield the protein half of every ``protein$rna`` text record.

    Args:
        samples: Optional iterable of records with a ``'text'`` entry of the
            form ``"<protein>$<rna>"``. When omitted, the notebook-level
            ``dataset`` is iterated, which is the original behaviour.

    Yields:
        The part of the stripped text that precedes the ``'$'`` separator.

    Raises:
        ValueError: if a stripped line does not split into exactly two parts.
    """
    source = dataset if samples is None else samples
    for record in source:
        # Two-way unpacking is kept so that a malformed line still fails
        # instead of being silently mis-split.
        protein, _ = record['text'].strip().split('$')
        yield protein


def rna_batch_iterator(samples=None):
    """Yield the RNA half of every ``protein$rna`` text record.

    Args:
        samples: Optional iterable of records with a ``'text'`` entry of the
            form ``"<protein>$<rna>"``. When omitted, the notebook-level
            ``dataset`` is iterated, which is the original behaviour.

    Yields:
        The part of the stripped text that follows the ``'$'`` separator.

    Raises:
        ValueError: if a stripped line does not split into exactly two parts.
    """
    source = dataset if samples is None else samples
    for record in source:
        # Two-way unpacking is kept so that a malformed line still fails
        # instead of being silently mis-split.
        _, rna = record['text'].strip().split('$')
        yield rna


In [None]:
# Same load_dataset call as the earlier cell that first defines `dataset`; re-running it rebinds the name.
dataset = load_dataset("text", data_files=args.train_data, split="train[:100]")

In [None]:
%load_ext autoreload
%autoreload 2
from src.utils.tokenizer import BpeTokenizer

# Build the protein-side tokenizer and train it directly on `dataset`.
protein_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
# `which` is left at its default here — presumably that selects the protein half; verify in BpeTokenizer.train_tokenizer.
protein_tokenizer.train_tokenizer(train_data=dataset)
# NOTE(review): the file-name suffix uses 2048 while seq_size above is 128 — confirm.
protein_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "pr_{}_{}".format(1000, 2048))

In [None]:
# Build the RNA-side tokenizer and train it on the same `dataset`.
rna_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
# which=False presumably switches training to the RNA half of each line; verify in BpeTokenizer.train_tokenizer.
rna_tokenizer.train_tokenizer(train_data=dataset, which=False)
# NOTE(review): the file-name suffix uses 2048 while seq_size above is 128 — confirm.
rna_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "rna_{}_{}".format(1000, 2048))

In [None]:
# Iterator over the raw text rows; the following cell pulls one row from it per execution.
iter_dt = iter(dataset)

In [None]:
# Take the next raw row and split it into its two '$'-separated halves.
temp = next(iter_dt)
protein, rna  = temp['text'].strip().split('$')

In [None]:
# Show both halves of the sample. Written as two separate bare expressions,
# only the last one (rna) was rendered; a tuple makes the cell display both.
protein, rna

In [None]:
# Print the token ids each tokenizer produces for the sample's protein and RNA strings.
print(protein_tokenizer.tokenize(protein).ids, rna_tokenizer.tokenize(rna).ids)

In [None]:
def tokenize_dataset(sample, protein_tokenizer, rna_tokenizer, pad_token_id=0, ignore_index=-100):
    """Turn one ``protein$rna`` text record into encoder inputs and labels.

    Args:
        sample: Mapping with a ``'text'`` entry of the form ``"<protein>$<rna>"``.
        protein_tokenizer: Object whose ``tokenize(str)`` result exposes
            ``ids`` and ``attention_mask``.
        rna_tokenizer: Object whose ``tokenize(str)`` result exposes ``ids``.
        pad_token_id: Target-side id to mask out of the labels. Defaults to 0,
            the value that was previously hard-coded.
        ignore_index: Label written in place of ``pad_token_id``. Defaults to
            -100, the value that was previously hard-coded.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` taken from the
        protein encoding and ``"labels"`` derived from the RNA ids.

    Raises:
        ValueError: if the stripped text does not split into exactly two parts.
    """
    protein, rna = sample['text'].strip().split('$')

    protein_tokenized = protein_tokenizer.tokenize(protein)
    rna_tokenized = rna_tokenizer.tokenize(rna)

    # Positions holding the pad id are replaced by ignore_index so they are
    # excluded when the loss is calculated.
    rna_labels = [
        ignore_index if token_id == pad_token_id else token_id
        for token_id in rna_tokenized.ids
    ]

    return {
        "input_ids": protein_tokenized.ids,
        "attention_mask": protein_tokenized.attention_mask,
        "labels": rna_labels,
    }

In [None]:
from copy import deepcopy



# Iterable-style version of `dataset`, used below only as the input to shuffle().
iterable_dataset = dataset.to_iterable_dataset()
# Filter dataset (disabled; kept for reference)
# filter_protein_tokenizer = deepcopy(protein_tokenizer)
# filter_protein_tokenizer.tokenizer.no_truncation()
# filter_rna_tokenizer = deepcopy(rna_tokenizer)
# filter_rna_tokenizer.tokenizer.no_truncation()
# filtered = iterable_dataset.filter(lambda sample: (len(filter_protein_tokenizer.tokenize(sample['text'].strip().split('$')[0]).ids) <= 2048 and len(filter_rna_tokenizer.tokenize(sample['text'].strip().split('$')[1]).ids) <= 2048))
# Shuffle dataset
# NOTE(review): `shuffled` is not read again in this cell — the map below runs on
# `dataset`, not on the shuffled stream. Confirm whether that is intended.
shuffled = iterable_dataset.shuffle(buffer_size = 10000)
# Tokenize dataset: applies tokenize_dataset to every row with the two trained tokenizers.
tokenized = dataset.map(lambda sample: tokenize_dataset(sample, protein_tokenizer, rna_tokenizer))

In [None]:
# Rebinds iter_dt, this time to an iterator over the tokenized rows.
iter_dt = iter(tokenized)


In [None]:
# Pull the next tokenized row and print its encoder input ids.
temp = next(iter_dt)
print(temp['input_ids'])

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Config, T5ForConditionalGeneration


# Instantiate a seq2seq model from an explicit T5Config rather than from a checkpoint.
# vocab_size matches the vocab_size=1000 used for the tokenizers in this notebook.
# NOTE(review): `model` is reassigned by get_t5_model(args) in the next cell, so this
# instance appears to be unused — confirm before relying on it.
configuration = T5Config(
                vocab_size=1000,
                bos_token_id=1, 
                decoder_start_token_id=0)
model = AutoModelForSeq2SeqLM.from_config(configuration)

In [None]:
from src.models import get_t5_model
# Build the T5 model from the parsed arguments.
model = get_t5_model(args)

# Count the parameters once; the same total is stored on `args` and reported.
args.model_size = sum(param.numel() for param in model.parameters())
print("Model Size: ", args.model_size)
print(model)

# Write every attribute currently held by `args` into the run directory as JSON.
args_dict = vars(args)
with open(args.results_dir + '/Main Config.json', 'w') as json_file:
    json.dump(args_dict, json_file, indent=4)
print("Config saved to ", args.results_dir)

In [None]:
# Configure and launch a Hugging Face Trainer run for the protein -> RNA model.
source_lang = "prot"
target_lang = "rna"
from transformers import TrainingArguments, Trainer
batch_size = 16
# model_name = model_checkpoint.split("/")[-1]
train_args = TrainingArguments(
    f"train-{source_lang}-to-{target_lang}",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    max_steps=4000,
    # predict_with_generate=True,
    # parse_opt falls back to 'cpu' when CUDA is unavailable; requesting fp16
    # unconditionally would make that fallback unusable, so enable it only
    # when a CUDA device is actually present.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
    do_eval=True,
    # resume_from_checkpoint='/data6/sobhan/rllm/train-prot-to-rna/checkpoint-3000'
    # push_to_hub=True,
)
# NOTE(review): eval_dataset is the same object as train_dataset, so the reported
# evaluation numbers are computed on the training rows — confirm this is intended.
trainer = Trainer(
    model,
    train_args,
    train_dataset=tokenized,
    eval_dataset=tokenized,
    # compute_metrics=compute_metrics
)
trainer.train()

In [None]:
# Replace `model` with weights loaded from a saved checkpoint directory.
# NOTE(review): the path points at "train2-prot-to-rna", whereas the training cell's
# output directory is f"train-{source_lang}-to-{target_lang}" — confirm the checkpoint location.
model = model.from_pretrained("/data6/sobhan/rllm/train2-prot-to-rna/checkpoint-4000")

In [None]:
# Fresh iterator over the tokenized rows for the qualitative checks in the next cells.
temp_data = iter(tokenized)


In [None]:
# Take the next tokenized row and decode its encoder input ids back to text.
temp = next(temp_data)
protein_tokenizer.decode(temp['input_ids'])

In [None]:
# Raw label ids for the same row (ids equal to 0 were replaced by -100 in tokenize_dataset).
print(temp['labels'])

In [None]:
# Decode the reference labels with the RNA tokenizer.
# NOTE(review): the labels built by tokenize_dataset contain -100 wherever the id was 0;
# confirm that rna_tokenizer.decode accepts that value.
rna_tokenizer.decode(temp['labels'])

In [None]:
# Generate for a single row: the ids get a leading batch dimension via unsqueeze(0)
# and are moved to the model's device; max_length is passed as 100.
model.generate(torch.tensor(temp['input_ids']).unsqueeze(0).to(model.device), max_length=100)

In [None]:
# Generate again for the same row and decode the first returned sequence with the RNA tokenizer.
# Unlike the previous cell, no max_length is passed here.
rna_tokenizer.decode(model.generate(torch.tensor(temp['input_ids']).unsqueeze(0).to(model.device))[0])

# Separator

In [None]:
# Arguments are passed positionally, matching the books.map(...) call in the cell
# below: english_tokenizer/french_tokenizer are not parameter names of the
# tokenize_dataset defined in this notebook (it takes protein_tokenizer, rna_tokenizer).
# NOTE(review): that definition reads sample['text'], whereas books records expose
# 'translation' — a translation-specific tokenize_dataset appears to be missing here.
temp = tokenize_dataset(books[1], english_tokenizer, french_tokenizer)

In [None]:
# Compare the lengths of the label and decoder-input sequences.
# NOTE(review): the tokenize_dataset defined in this notebook returns no
# 'decoder_input_ids' key — verify which definition this cell expects.
len(temp['labels']), len(temp['decoder_input_ids'])


In [None]:
# Rebinds `tokenized` (previously the protein/RNA rows) to the tokenized translation pairs.
tokenized = books.map(lambda sample: tokenize_dataset(sample, english_tokenizer, french_tokenizer))

In [None]:
# Display the first tokenized record.
tokenized[0]

In [None]:
# Take the first tokenized record and show the lengths of its three sequences.
# NOTE(review): 'decoder_input_ids' is not produced by the tokenize_dataset defined in this notebook — verify.
data_temp = next(iter(tokenized))
len(data_temp["input_ids"]),len(data_temp["decoder_input_ids"]),len(data_temp["labels"])

In [None]:
# Build the model selected by the parsed arguments.
model = get_model(args=args)

# Count the parameters once; the same total is stored on `args` and reported.
args.model_size = sum(param.numel() for param in model.parameters())
print("Model Size: ", args.model_size)
print(model)

# Write every attribute currently held by `args` into the run directory as JSON.
args_dict = vars(args)
with open(args.results_dir + '/Main Config.json', 'w') as json_file:
    json.dump(args_dict, json_file, indent=4)
print("Config saved to ", args.results_dir)

In [None]:
%load_ext autoreload
%autoreload 2
import os
# args.device = 'cuda:0'
# model.device = 'cuda:0'

# Launch the project's train() unless the run is flagged as a sanity check.
# The same `tokenized` dataset is passed as both the train and eval set, and the
# imported wandb module itself is handed over as the `wandb` argument.
if not args.sanity_check:
    train(args=args, wandb=wandb, model=model, train_dataset=tokenized, eval_dataset=tokenized, enc_tokenizer=english_tokenizer, dec_tokenizer=french_tokenizer)

# Closing banner with the finish time, formatted like the start-time banner.
print("============================================================================================")
end_time = datetime.now(pytz.timezone('Turkey')).strftime("%Y-%m-%d %H:%M")
print("Finished training at : ", end_time)
print("============================================================================================")


AAaaaaaaaaa

  from .autonotebook import tqdm as notebook_tqdm
