<a href="https://colab.research.google.com/github/liangli217/LLM_learning/blob/main/LLM_for_genomics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the Mistral-DNA Git repository into the current working directory.
# Later cells change into the cloned "Mistral-DNA/" folder and read the model
# configuration and the genome sequences from its data/ sub-directory.
!git clone https://github.com/raphaelmourad/Mistral-DNA.git

In [None]:
# WHEN USING GOOGLE COLLAB
!pip install datasets==3.0.1
!pip install flash-attn

In [None]:
# CHECK GPU
# Shows how much VRAM is in use and the current GPU utilization.
!nvidia-smi

In [None]:
# IMPORT LIBRARIES
import os

import torch
import numpy as np

from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, AutoConfig
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset

In [None]:
# SET DIRECTORY
# Enter the cloned repository only when we are not already inside it, so that
# re-running this cell does not fail looking for a nested "Mistral-DNA/" folder.
if os.path.basename(os.getcwd()) != "Mistral-DNA":
    os.chdir("Mistral-DNA/")
print(os.getcwd())

In [None]:
# List the contents of the current working directory (the cloned repository
# once the directory cell above has run).
!ls

In [None]:
# CHOOSE THE LLM ARCHITECTURE
# To do during class:
# - look at the original architecture of Mixtral-8x7B-v0.1 and discuss the model
# - change the model architecture by adding or removing transformer blocks,
#   hidden states, number of attention heads, and number of experts
# - test a BERT model architecture?
# NB: flash attention 2 does not work with a T4 GPU, hence the "eager" attention
#     below. On a supported GPU, pass attn_implementation="flash_attention_2".

# Mixture-of-experts configuration read from the repository's data/models folder.
config = AutoConfig.from_pretrained("data/models/Mixtral-8x7B-v0.1")

# Build the model from the configuration only (no pretrained weights are loaded).
model = AutoModelForCausalLM.from_config(
    config,
    attn_implementation="eager",
)
model

MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(4096, 256)
    (layers): ModuleList(
      (0-7): 8 x MixtralDecoderLayer(
        (self_attn): MixtralAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=256, bias=False)
          (v_proj): Linear(in_features=256, out_features=256, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear(in_features=256, out_features=64, bias=False)
          (experts): ModuleList(
            (0-63): 64 x MixtralBlockSparseTop2MLP(
              (w1): Linear(in_features=256, out_features=256, bias=False)
              (w2): Linear(in_features=256, out_features=256, bias=False)
              (w3): Linear(in_features=256, out_features=256, bias=False)
              (act_fn): SiLU()
            )
          )
        )
        (

In [None]:
# LOAD BPE LETTER TOKENIZER
# trust_remote_code=True allows the custom tokenizer code published in the
# "zhihan1996/DNABERT-2-117M" Hub repository to run locally; only enable it
# for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
# Pad on the left: padding tokens are placed before the sequence tokens.
tokenizer.padding_side  = 'left'
print(tokenizer)

In [None]:
# DNA encoding
# Tokenize one example sequence and show the resulting PyTorch tensors.
# truncation=True was dropped: this tokenizer has no predefined maximum length,
# so the flag only produced a warning and fell back to "no truncation".
encoding = tokenizer("ATTGTGGGTCCCCC", padding="longest", return_tensors="pt")
print(encoding)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': tensor([[   1, 2061,  281,  485,    6,    2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [None]:
# Add up the element count of every parameter tensor and report it in millions.
pytorch_total_params = 0
for param in model.parameters():
    pytorch_total_params += param.numel()
print(f"Model size: {pytorch_total_params/1000**2:.1f}M parameters")

Model size: 105.0M parameters


In [None]:
# Load data
# Raw DNA sequences from the compressed CSV shipped in the repository's data/ folder.
dataset_text = load_dataset(
    "csv",
    data_files="data/genome_sequences/hg38/sequences_hg38_200b_verysmall.csv.gz",
)

# Collator for causal language modelling: mlm=False turns off masked-token corruption.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
# Tokenize every training sequence in a single call, padded to the longest
# sequence and returned as PyTorch tensors.
# NOTE(review): `tokenized_data` is not referenced by any other cell visible in
# this notebook; the Trainer is fed from `dataset`, which is built with
# `tokenize_function`. Confirm whether this cell is still needed.
tokenized_data = tokenizer(dataset_text['train']['text'], padding="longest", truncation=True, return_tensors="pt")


In [None]:
# Inspect the "train" split: its column names and number of rows.
dataset_text['train']

Dataset({
    features: ['text'],
    num_rows: 99999
})

In [None]:
# Inspect the whole DatasetDict to see which splits were loaded.
dataset_text

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 99999
    })
})

In [None]:
# TOKENIZE DATA
def tokenize_function(examples):
    """Tokenize one batch of rows from the text dataset.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``;
            its ``'text'`` entry holds the DNA sequences to tokenize.

    Returns:
        The tokenizer output for the batch, as plain unpadded Python lists.
    """
    # No padding and no tensors here: the data collator pads each training
    # batch to its own longest sequence, so padding every map batch up front
    # only adds pad tokens the model has to process.
    # truncation=True was a no-op for this tokenizer (it has no predefined
    # maximum length), so it is left out to avoid the warning.
    return tokenizer(examples['text'])

# Run tokenize_function over every split, one batch of rows at a time; the
# tokenizer outputs are stored as new columns next to 'text'.
dataset = dataset_text.map(tokenize_function, batched=True)
# print(dataset["train"])

In [None]:
# Keep 80% of the sequences for training and hold out the rest for validation.
train_size = int(0.8 * len(dataset["train"]))
test_size = len(dataset["train"]) - train_size
# Seed the split so the train/validation partition is identical on every run.
train_set, val_set = torch.utils.data.random_split(
    dataset['train'],
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

FP16 vs BF16

In [None]:
# Parameters for Pretraining
batchsize = 32
training_args = TrainingArguments(
    output_dir = './results/models',
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # two strategies to match.
    eval_strategy ='epoch',
    save_strategy = 'epoch',
    num_train_epochs = 10,
    per_device_train_batch_size = batchsize,
    per_device_eval_batch_size = batchsize,
    learning_rate = 5e-4,
    weight_decay = 0.01,
    logging_dir = './logs',
    load_best_model_at_end = True,
    # NOTE(review): bf16 needs support from the GPU/software stack - confirm on
    # the target hardware (the architecture cell mentions a T4).
    bf16 = True,
    # 32 sequences x 50 accumulation steps = 1600 sequences per optimizer update
    # (per device).
    gradient_accumulation_steps = 50,
    # Turn off every logging integration (Weights & Biases included). This
    # replaces the deprecated WANDB_DISABLED environment variable, which was
    # also being set only after the arguments had already been built.
    report_to = "none",
)

print(training_args)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False,

In [None]:
!pip install wandb

In [None]:
# Pretrain Model
# Early stopping with a patience of 3 evaluation rounds.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, the training arguments, the two data splits and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

print('Starting a trainer...')
# Run the training loop; the returned training summary is shown as the cell output.
trainer.train()