In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


import argparse
import os
import wandb
from datetime import datetime
import pytz
import time
import json



from src.models import get_model
from src.utils.helpers import set_hyps
from src.utils.tokenizer import get_tokenizer, BpeTokenizer
from src.data import get_datasets
from train import train
from generate import generate

import torch

def parse_opt():
    ################################################################ Arguments

    parser = argparse.ArgumentParser(description='Multilingual RNA Implementation')

    # Trainig Configuration
    parser.add_argument('--train-hyp', default="/data6/sobhan/rllm/hyps/train.yaml", type=str, help='Training Arguments hyperprameters')
    parser.add_argument('--model-hyp', default="/data6/sobhan/rllm/hyps/t5.yaml", type=str, help='Model hyperprameters')

    # utils
    parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')

    # action
    parser.add_argument('--train', default=True, type=bool, help='train or generate')

    # Generation Configurations
    parser.add_argument('--data', default="/data6/sobhan/rllm/dataset/rph/test_rp.txt", type=str, help='Fasta File Path')
    parser.add_argument('--log', default=False, type=bool, help='Log the RNAs or Not')

    parser.add_argument('--results-dir', default='./results', type=str, metavar='PATH', help='path to cache (default: none)')

    # args = parser.parse_args()  # running in command line
    args = parser.parse_args('')  # running in ipynb

    args.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return args


def main(args:object, wandb)->None:
    # track total training time
    start_time = datetime.now(pytz.timezone('Turkey')).strftime("%Y-%m-%d %H:%M")
    args.start_time = start_time
    if args.train:
        print("============================================================================================")
        print("Started training at : ", start_time)
        print("============================================================================================")
    else:
        print("============================================================================================")
        print("Started generating at : ", start_time)
        print("============================================================================================")

    # Handle Training Arguments
    args = set_hyps(args.train_hyp, args)
    args = set_hyps(args.model_hyp, args)
    args.results_dir = os.path.join(args.results_dir, args.model)
    if not os.path.exists(args.results_dir):
        os.makedirs(args.results_dir)    
    args.results_dir = os.path.join(args.results_dir, "run"+str(len(os.listdir(args.results_dir)))+"_"+time.strftime("%Y%m%d-%H%M%S"))
    os.makedirs(args.results_dir)
    
    plots_dir = args.results_dir+'/plots'
    os.mkdir(plots_dir)

    # Load the main components
    # protein_tokenizer = get_tokenizer(tokenizer_name="bpe", vocab_size=1000, seq_size=128, tokenizer_path=args.protein_tokenizer)
    # rna_tokenizer = get_tokenizer(tokenizer_name="bpe", vocab_size=1000, seq_size=128, tokenizer_path=args.rna_tokenizer)

    from datasets import load_dataset
    dataset = load_dataset("text", data_files=args.train_data, split="train[:100]")
    protein_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
    protein_tokenizer.train_tokenizer(train_data=dataset)
    protein_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "prrrr_{}_{}".format(1000, 128))

    rna_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
    rna_tokenizer.train_tokenizer(train_data=dataset, which=False)
    rna_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "rnaaaa_{}_{}".format(1000, 128))
    
    train_dataset, eval_dataset = get_datasets(args, protein_tokenizer=protein_tokenizer, rna_tokenizer=rna_tokenizer)

    model = get_model(args=args)

    args.model_size = sum(p.numel() for p in model.parameters())
    print("Model Size: ", sum(p.numel() for p in model.parameters()))
    print(model)

    # Saving the configs
    args_dict = vars(args)
    with open(args.results_dir + '/Main Config.json', 'w') as json_file:
        json.dump(args_dict, json_file, indent=4)
    print("Config saved to ", args.results_dir)

    # trainer = CustomTrainer(args=args, wandb=wandb, model=model, train_dataset=train_dataset, eval_dataset=eval_dataset, enc_tokenizer=protein_tokenizer, dec_tokenizer=rna_tokenizer)
    # trainer.train()
    # if args.train:
    #     # model = train(args=args, wandb=wandb, model=model, train_dataset=train_dataset, eval_dataset=eval_dataset, enc_tokenizer=protein_tokenizer, dec_tokenizer=rna_tokenizer)
    # else:
    #     generate(args=args)
    end_time = datetime.now(pytz.timezone('Turkey')).strftime("%Y-%m-%d %H:%M")

    if args.train:
        print("============================================================================================")
        print("Finished training at : ", end_time)
        print("============================================================================================")
    else:
        print("============================================================================================")
        print("Finished generating at : ", end_time)
        print("============================================================================================")

    return model


# # if __name__=="__main__":
#     os.environ['WANDB_DISABLED'] = 'true'
#     # Log in to your W&B account
#     # wandb.login()
#     # wandb.init(project="RNA-LLM")
#     # os.environ["WANDB_LOG_MODEL"] = "checkpoint"
#     # os.environ["WANDB_WATCH"] = "all"

#     args=parse_opt()
#     main(args, wandb)
#     wandb.finish()




  from .autonotebook import tqdm as notebook_tqdm


In [3]:

os.environ['WANDB_DISABLED'] = 'true'
# Log in to your W&B account
# wandb.login()
# wandb.init(project="RNA-LLM")
# os.environ["WANDB_LOG_MODEL"] = "checkpoint"
# os.environ["WANDB_WATCH"] = "all"

args=parse_opt()
model = main(args, wandb)
wandb.finish()

Started training at :  2024-08-06 14:47
enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead



enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead



Model Size:  44569088
T5ForConditionalGeneration(
  (shared): Embedding(1000, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(1000, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF

In [4]:
model = get_model(args=args)

In [5]:
model = model.from_pretrained("/data6/sobhan/rllm/results/train/t5/run1_20240806-082437/checkpoint-10000")

In [6]:
protein_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
protein_tokenizer.load("/data6/sobhan/rllm/dataset/tokenizers/prrrr_1000_128.json")

rna_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=128)
rna_tokenizer.load("/data6/sobhan/rllm/dataset/tokenizers/rnaaaa_1000_128.json")


train_dataset, eval_dataset = get_datasets(args, protein_tokenizer=protein_tokenizer, rna_tokenizer=rna_tokenizer)

enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead
enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead


In [7]:
temp_data = iter(train_dataset)

In [13]:
temp = next(temp_data)
protein_tokenizer.decode(temp['input_ids'])

'ma eki le k ldv ld kqa ei ila rr t ki nr lq se gr ktt ma i plt f df qle f eea lat sa ska i ski ke dk sc si t ksk m hv sf kc epe pr k sn f ek sn lr pf fi qt nv kn ke se sta qi ekk pr kp ld sv gll eg dr n kr kk sp q mn df ni ke nk svr ny qls ky r sv rkk sll p lc f ed e lkn ph a ki v nv spt kt vt sh m eq kd tn pi if h dt ey vr mll lt kn rf ssh ple ne ni y ph kr tn fi ler nc ei lk si ig'

In [14]:
temp['input_ids']

[128,
 387,
 29,
 11,
 355,
 40,
 778,
 93,
 510,
 52,
 19,
 75,
 131,
 31,
 56,
 76,
 651,
 128,
 10,
 407,
 7,
 165,
 335,
 7,
 361,
 572,
 50,
 410,
 10,
 797,
 35,
 225,
 152,
 77,
 19,
 780,
 13,
 209,
 84,
 205,
 856,
 61,
 11,
 100,
 7,
 141,
 100,
 32,
 157,
 121,
 96,
 105,
 89,
 35,
 56,
 723,
 116,
 506,
 61,
 343,
 40,
 49,
 448,
 68,
 127,
 14,
 66,
 30,
 111,
 16,
 202,
 165,
 79,
 35,
 238,
 643,
 254,
 334,
 130,
 17,
 49,
 519,
 198,
 15,
 92,
 7,
 46,
 6,
 402,
 125,
 3,
 75,
 20,
 105,
 359,
 67,
 140,
 113,
 13,
 183,
 58,
 360,
 117,
 185,
 9,
 114,
 153,
 143,
 883,
 42,
 89,
 285,
 630,
 784,
 237,
 79,
 22,
 125,
 66,
 360,
 121,
 363,
 236,
 93,
 41,
 77,
 151,
 1]

In [15]:
rna_tokenizer.decode(temp['labels'])

'j z bzzjjjj zzzj uzjj zzjzj zzzjzjj zjzzj bjz ujjbj bzzj uzzzuj zjz bzz uzzjz bujzj uzbz bzzjbzz bzzzjj z bzzj zzzjj uzzz bzzj zzjjj zz bzz uzzzzj zjjzj uzzzzjj zzjjj zj bj zzzj uujbuj uzbz zzjzj bjzz jjj zjz bjjz bzjz uzj zzbj uzzjj ujjj bbzjz uzzzzj zbj bbzj uzjjzz bzjuzj bb uzzzjj zjzzj uzzzz buuj uuzuu uzzjj ujj uzzzj zjj zuz uzzj uzjuj bzjuj zzzjj bjzz bzbzz bzj bzjjjjj zz bzbj bzjzzj bjbz bzz uzjuuj uzjj bzz uubz uzz uubj uuzz buz buuz ubjj uzuz buzuuz bbbjj buuz bubz buuubb ubbj ubj bzzj uuuujjj bbzbj ubj bbbb bjuuj uuz bbbz uzbj ubj uu ubzz ubuu bj bbzz ubb bjj uzbbz bzzjj zuz ujjuuuj uzzbb uzbj buj bzjj bzuz uzzbbz bbzzz bjuz uzjuzuj bbbjj buzuj bb'

In [16]:
print(temp['labels'])

[4, 6, 599, 73, 41, 220, 898, 316, 60, 228, 42, 797, 166, 24, 309, 323, 74, 556, 221, 6, 42, 173, 35, 42, 182, 12, 24, 233, 393, 474, 182, 19, 14, 73, 955, 74, 220, 140, 20, 166, 157, 133, 18, 190, 76, 44, 434, 233, 100, 46, 553, 212, 11, 153, 316, 127, 63, 366, 76, 26, 91, 36, 110, 34, 122, 120, 173, 140, 183, 22, 634, 12, 87, 282, 61, 24, 265, 41, 24, 222, 15, 143, 67, 25, 104, 71, 47, 626, 305, 104, 144, 297, 97, 33, 42, 800, 197, 33, 40, 259, 31, 56, 84, 33, 10, 115, 54, 14, 50, 32, 23, 101, 80, 110, 468, 184, 84, 29, 37, 117, 273, 210, 124, 941, 305, 170, 11, 1]


In [17]:
model.generate(torch.tensor(temp['input_ids']).unsqueeze(0).to(model.device), max_length=100)

tensor([[  0,   4,   6, 599,  73,  41, 220, 898, 316,  60, 228,  42, 797, 166,
          24, 309, 323,  74, 556, 221,   6,  42, 173,  35,  42, 182,  12,  24,
         233, 393, 474, 182,  19,  14,  73, 955,  74, 220, 140,  20, 166, 157,
         133,  18, 190,  76,  44, 434, 233, 100,  46, 553, 212,  11, 153, 316,
         127,  63, 366,  76,  26,  91,  36, 110,  34, 122, 120, 173, 140, 183,
          22, 634,  12,  87, 282,  61,  24, 265,  41,  24, 222,  15, 143,  67,
          25, 104,  71,  47, 626, 305, 104, 144, 297,  97,  33,  42, 800, 197,
          33,  40]])

In [28]:
rna_tokenizer.decode(model.generate(torch.tensor(temp['input_ids']).unsqueeze(0).to(model.device))[0])



'z uzzjj bzzj uujzj bzzz buuzzj zjujj zzz uzjz ujj zjz ujj zzjz uzz uzbj zj bzj bjzuj zbj'

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

import argparse
import os
import wandb
from datetime import datetime
import pytz
import time
import json

from src.models import get_model
from src.utils.helpers import set_hyps
from src.utils.tokenizer import get_tokenizer, BpeTokenizer
from src.data import get_datasets
from train import train
from generate import generate

import torch

def parse_opt():
    ################################################################ Arguments

    parser = argparse.ArgumentParser(description='Multilingual RNA Implementation')

    # action
    parser.add_argument('--train', default=True, type=bool, help='Train or Generate')

    # Trainig Configuration
    parser.add_argument('--train-hyp', default="/data6/sobhan/rllm/hyps/train.yaml", type=str, help='Training Arguments hyperprameters')
    parser.add_argument('--model-hyp', default="/data6/sobhan/rllm/hyps/t5.yaml", type=str, help='Model hyperprameters')

    # Generation Configurations
    parser.add_argument('--checkpoints', default=None, type=str, help='Load Model')


    parser.add_argument('--results-dir', default='./results', type=str, metavar='PATH', help='path to cache (default: none)')

    # args = parser.parse_args()  # running in command line
    args = parser.parse_args('')  # running in ipynb

    args.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return args

args=parse_opt()
# track total training time
start_time = datetime.now(pytz.timezone('Turkey')).strftime("%Y-%m-%d %H:%M")
args.start_time = start_time
if args.train:
    print("============================================================================================")
    print("Started training at : ", start_time)
    print("============================================================================================")
else:
    print("============================================================================================")
    print("Started generating at : ", start_time)
    print("============================================================================================")

# Handle Training Arguments
args = set_hyps(args.train_hyp, args)
args = set_hyps(args.model_hyp, args)
if args.train: args.results_dir = os.path.join(args.results_dir, "train")
else: args.results_dir = os.path.join(args.results_dir, "inference")
args.results_dir = os.path.join(args.results_dir, args.model)
if not os.path.exists(args.results_dir):
    os.makedirs(args.results_dir)    
args.results_dir = os.path.join(args.results_dir, "run"+str(len(os.listdir(args.results_dir)))+"_"+time.strftime("%Y%m%d-%H%M%S"))
os.makedirs(args.results_dir)

plots_dir = args.results_dir+'/plots'
os.mkdir(plots_dir)

# Load the main components
# protein_tokenizer = get_tokenizer(tokenizer_name="bpe", vocab_size=1000, seq_size=2048, tokenizer_path=args.protein_tokenizer)
# rna_tokenizer = get_tokenizer(tokenizer_name="bpe", vocab_size=1000, seq_size=2048, tokenizer_path=args.rna_tokenizer)

from datasets import load_dataset
dataset = load_dataset("text", data_files=args.train_data, split="train[:100]")
protein_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=2048)
protein_tokenizer.train_tokenizer(train_data=dataset)
protein_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "p_{}_{}".format(1000, 2048))

rna_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=2048)
rna_tokenizer.train_tokenizer(train_data=dataset, which=False)
rna_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "r_{}_{}".format(1000, 2048))


train_dataset, eval_dataset = get_datasets(args, protein_tokenizer=protein_tokenizer, rna_tokenizer=rna_tokenizer)

model = get_model(args=args)

args.model_size = sum(p.numel() for p in model.parameters())
print("Model Size: ", sum(p.numel() for p in model.parameters()))
print(model)

# Saving the configs
args_dict = vars(args)
with open(args.results_dir + '/Main Config.json', 'w') as json_file:
    json.dump(args_dict, json_file, indent=4)
print("Config saved to ", args.results_dir)


# if args.train:
    # train(args=args, wandb=wandb, model=model, train_dataset=train_dataset, eval_dataset=eval_dataset, enc_tokenizer=protein_tokenizer, dec_tokenizer=rna_tokenizer)
# else:
#     generate(args=args, eval_dataset=eval_dataset, model=model, dec_tokenizer=rna_tokenizer)

end_time = datetime.now(pytz.timezone('Turkey')).strftime("%Y-%m-%d %H:%M")
if args.train:
    print("============================================================================================")
    print("Finished training at : ", end_time)
    print("============================================================================================")
else:
    print("============================================================================================")
    print("Finished generating at : ", end_time)
    print("============================================================================================")



  from .autonotebook import tqdm as notebook_tqdm


Started training at :  2024-08-07 19:14
enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead



enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead



Model Size:  4431360896
T5ForConditionalGeneration(
  (shared): Embedding(1000, 2048)
  (encoder): T5Stack(
    (embed_tokens): Embedding(1000, 2048)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=2048, out_features=14336, bias=False)
              (k): Linear(in_features=2048, out_features=14336, bias=False)
              (v): Linear(in_features=2048, out_features=14336, bias=False)
              (o): Linear(in_features=14336, out_features=2048, bias=False)
              (relative_attention_bias): Embedding(32, 14)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
       

In [2]:
from datasets import load_dataset
dataset = load_dataset("text", data_files=args.train_data, split="train[:100]")
protein_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=2048)
protein_tokenizer.train_tokenizer(train_data=dataset)
protein_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "p_{}_{}".format(1000, 2048))

rna_tokenizer = BpeTokenizer(vocab_size=1000, seq_size=2048)
rna_tokenizer.train_tokenizer(train_data=dataset, which=False)
rna_tokenizer.save("/data6/sobhan/rllm/dataset/tokenizers", "r_{}_{}".format(1000, 2048))


train_dataset, eval_dataset = get_datasets(args, protein_tokenizer=protein_tokenizer, rna_tokenizer=rna_tokenizer)

enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead



enable_padding(max_length=X) is deprecated, use enable_padding(length=X) instead





In [3]:
temp_data = iter(train_dataset)

In [4]:
temp = next(temp_data)

In [5]:
input = torch.tensor(temp['input_ids']).unsqueeze(0).to(model.device)

In [6]:
input = torch.tensor([temp['input_ids'], temp['input_ids']]).to(model.device)

In [7]:
input.size()

torch.Size([2, 2048])

In [8]:
model(input_ids=input)

OutOfMemoryError: CUDA out of memory. Tried to allocate 448.00 MiB. GPU 0 has a total capacty of 23.64 GiB of which 90.50 MiB is free. Including non-PyTorch memory, this process has 23.55 GiB memory in use. Of the allocated memory 22.63 GiB is allocated by PyTorch, and 177.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [14]:
model.generate(input, max_length=100)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]], device='cuda:0')