In [1]:

import os
import numpy as np
import pandas as pd


from torch.utils.data import Dataset
from peft import (
    LoraConfig, 
    get_peft_model, 
)

from functools import partial
from typing import Any, Dict, List

import torch
import wandb
from datasets import DatasetDict, load_dataset
from omegaconf import DictConfig

from torch import nn

from transformers import (
    LlamaTokenizer, 
    LlamaForSequenceClassification,
    AutoTokenizer,
    Trainer, 
    TrainingArguments,
    AutoModelForSequenceClassification,
    EarlyStoppingCallback,
    TrainerCallback,
)

from structllm.models.utils import CustomWandbCallback_FineTune, EvaluateFirstStepCallback


2024-04-22 10:34:04.416479: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 10:34:04.416691: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 10:34:04.491738: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-22 10:34:04.646810: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Sentinel label value; not referenced by any other cell visible in this notebook.
IGNORE_INDEX = -100
# Maximum sequence length, handed to the tokenizer as `model_max_length`.
MAX_LENGTH = 2048

# Fallback special-token strings, used only when the tokenizer does not
# already define the corresponding token.
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "<unk>"
DEFAULT_PAD_TOKEN = "[PAD]"


In [7]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict,
    llama_tokenizer,
    model,
):
    """Register special tokens and grow the model's input embeddings to match.

    The embedding matrix is resized with ``pad_to_multiple_of=8``, so it may
    contain more rows than the tokenizer has tokens. Rows belonging to newly
    added tokens are initialised to the mean of the pre-existing token
    embeddings; any extra padding rows are left as the resize produced them.

    Args:
        special_tokens_dict: Mapping passed to
            ``llama_tokenizer.add_special_tokens`` (e.g. ``{"pad_token": "[PAD]"}``).
        llama_tokenizer: Tokenizer to extend; modified in place.
        model: Model exposing ``resize_token_embeddings``,
            ``get_input_embeddings`` and ``config``; modified in place.

    Returns:
        None. ``model.config.pad_token_id`` is set to the tokenizer's pad id.
    """
    # Register the tokens exactly once; the return value is the number of
    # tokens that were actually new to the vocabulary.
    num_new_tokens = llama_tokenizer.add_special_tokens(special_tokens_dict)
    vocab_size = len(llama_tokenizer)
    model.resize_token_embeddings(vocab_size, pad_to_multiple_of=8)

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data

        # Index relative to the tokenizer length, not the end of the matrix:
        # because of the padding above, the last rows of the matrix are not
        # necessarily the new tokens.
        # Assumes new tokens receive the highest ids in the tokenizer.
        first_new_row = vocab_size - num_new_tokens
        input_embeddings_avg = input_embeddings[:first_new_row].mean(dim=0, keepdim=True)
        input_embeddings[first_new_row:vocab_size] = input_embeddings_avg

    model.config.pad_token_id = llama_tokenizer.pad_token_id

In [8]:
# Checkpoint identifier shared by the tokenizer and the model-loading cell.
pretrained_ckpt = "meta-llama/Llama-2-7b-hf"

# Load the tokenizer with right-side padding and a maximum length of
# MAX_LENGTH; the non-fast implementation is requested explicitly.
llama_tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_ckpt,
    use_fast=False,
    padding_side="right",
    model_max_length=MAX_LENGTH,
)

In [9]:
from transformers import BitsAndBytesConfig

# --- 4-bit quantization settings -------------------------------------------
use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for compute
use_nested_quant = False            # nested ("double") quantization

# --- Run-level settings ----------------------------------------------------
output_dir = "./results"            # where predictions/checkpoints would be stored
num_train_epochs = 1
max_steps = -1                      # number of training steps (overrides num_train_epochs)

# --- Precision -------------------------------------------------------------
# NOTE(review): bf16 is enabled here while the bitsandbytes compute dtype above
# is float16 — confirm this combination is intentional.
fp16 = False
bf16 = True

# --- Batching --------------------------------------------------------------
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1     # update steps to accumulate gradients for
group_by_length = True              # group sequences of similar length into batches
gradient_checkpointing = True

# --- Optimisation ----------------------------------------------------------
optim = "paged_adamw_32bit"
learning_rate = 2e-4                # initial learning rate
weight_decay = 0.001
max_grad_norm = 0.3                 # gradient-clipping norm
lr_scheduler_type = "cosine"
warmup_ratio = 0.03                 # fraction of steps used for warmup

# --- Checkpointing / logging cadence (in update steps) ---------------------
save_steps = 0
logging_steps = 25

# Resolve the dtype name to the actual torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# When computing in float16 on a GPU with compute capability >= 8, print a
# hint that bfloat16 is available.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

Your GPU supports bfloat16: accelerate training with bf16=True


In [11]:
# Place the whole model on device index 0.
device_map = {"": 0}

# Quantized base model with a single-output classification head (num_labels=1).
model = LlamaForSequenceClassification.from_pretrained(
    pretrained_ckpt,
    num_labels=1,
    quantization_config=bnb_config,
    device_map=device_map,
)

# LoRA adapter configuration. The task type is "SEQ_CLS" because the wrapped
# model is a *sequence* classification model; the previous "TOKEN_CLS" value
# described a per-token classification task, which this notebook does not do.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
        

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,198,400 || all params: 6,611,546,112 || trainable%: 0.0635010318143267


In [12]:
# Collect a fallback string for every special token the tokenizer is missing.
# The mapping order (pad, eos, bos, unk) determines the order of the result.
special_tokens_dict = {
    token_name: fallback
    for token_name, fallback in {
        "pad_token": DEFAULT_PAD_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }.items()
    if getattr(llama_tokenizer, token_name) is None
}

print(special_tokens_dict)

# Register the missing tokens and resize the model embeddings accordingly.
smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    llama_tokenizer=llama_tokenizer,
    model=model,
)

{'pad_token': '[PAD]'}


In [13]:
# Display the tokenizer's current vocabulary size.
len(llama_tokenizer)

32001

In [14]:
def _tokenize(examples):
    """Tokenize the ``crystal_llm_rep`` field of ``examples``.

    Uses the module-level ``llama_tokenizer`` with truncation and padding
    enabled, and returns the tokenizer's output unchanged.
    """
    return llama_tokenizer(
        examples["crystal_llm_rep"],
        padding=True,
        truncation=True,
    )

def _prepare_datasets(path: str, test_size: float = 0.2, seed: int = 42) -> DatasetDict:
    """Load a JSON dataset, split it into train/test and tokenize both splits.

    Args:
        path: Location of the JSON data file, passed to ``load_dataset`` as
            ``data_files``.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.2.
        seed: Seed forwarded to ``train_test_split`` for the shuffled split.
            Defaults to 42.

    Returns:
        DatasetDict: The result of ``train_test_split`` (``train`` and
        ``test`` splits) after ``_tokenize`` has been mapped over it in
        batches.
    """
    ds = load_dataset("json", data_files=path, split="train")
    dataset = ds.train_test_split(shuffle=True, test_size=test_size, seed=seed)
    return dataset.map(_tokenize, batched=True)

In [15]:
dataset = _prepare_datasets("/work/so87pot/material_db/matbench_sml/train_matbench_log_kvrh_0.json")

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['cif_p1', 'cif_symmetrized', 'cif_bonding', 'slice', 'composition', 'crystal_llm_rep', 'robocrys_rep', 'wycoff_rep', 'mbid', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 4858
    })
    test: Dataset({
        features: ['cif_p1', 'cif_symmetrized', 'cif_bonding', 'slice', 'composition', 'crystal_llm_rep', 'robocrys_rep', 'wycoff_rep', 'mbid', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1215
    })
})

In [17]:
# Module-level switches read by `_callbacks`.
early_stopping = True
custom_logger = True
def _callbacks(self) -> List[TrainerCallback]:
    """Build the list of trainer callbacks for early stopping and logging.

    Args:
        self: Object exposing ``callbacks.early_stopping_patience`` and
            ``callbacks.early_stopping_threshold``; only read when the
            module-level ``early_stopping`` flag is true.

    Returns:
        List[TrainerCallback]: Callback instances, in order: the optional
        early-stopping callback, the optional custom W&B logger, and an
        ``EvaluateFirstStepCallback``.
    """
    callbacks: List[TrainerCallback] = []

    if early_stopping:
        callbacks.append(EarlyStoppingCallback(
            early_stopping_patience=self.callbacks.early_stopping_patience,
            early_stopping_threshold=self.callbacks.early_stopping_threshold
        ))

    if custom_logger:
        callbacks.append(CustomWandbCallback_FineTune())

    # Append an instance (not the class itself) so every element matches the
    # declared return type and the other entries in the list.
    # Assumes the callback can be constructed without arguments — TODO confirm.
    callbacks.append(EvaluateFirstStepCallback())

    return callbacks

In [18]:
def _compute_metrics(p: Any, eval=True) -> Dict[str, float]:
        preds = torch.tensor(p.predictions.squeeze())  # Convert predictions to PyTorch tensor
        label_ids = torch.tensor(p.label_ids)  # Convert label_ids to PyTorch tensor

        if eval:
            # Calculate RMSE as evaluation metric
            eval_rmse = torch.sqrt(((preds - label_ids) ** 2).mean()).item()
            return {"eval_rmse": round(eval_rmse, 3)}
        else:
            # Calculate RMSE as training metric
            loss = torch.sqrt(((preds - label_ids) ** 2).mean()).item()
            return {"train_rmse": round(loss, 3), "loss": round(loss, 3)}

In [19]:
# --- Batching --------------------------------------------------------------
# Per-GPU batch sizes for training and evaluation.
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
# Number of update steps over which gradients are accumulated.
gradient_accumulation_steps = 1
# Group sequences of similar length into the same batch.
group_by_length = True
# Enable gradient checkpointing.
gradient_checkpointing = True

# --- Optimisation ----------------------------------------------------------
# Optimizer name.
optim = "paged_adamw_32bit"
# Initial learning rate.
learning_rate = 2e-4
# Weight decay coefficient.
weight_decay = 0.001
# Maximum gradient norm (gradient clipping).
max_grad_norm = 0.3

# --- Schedule --------------------------------------------------------------
# Learning-rate schedule.
lr_scheduler_type = "cosine"
# Fraction of steps used for warmup (from 0 up to the learning rate).
warmup_ratio = 0.03
# Number of training steps (overrides num_train_epochs).
max_steps = -1

# --- Checkpointing / logging cadence (in update steps) ---------------------
save_steps = 0
logging_steps = 25

In [21]:
#os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
training_args = TrainingArguments(
    # Output location and reproducibility.
    output_dir="/home/so87pot/n0w0f/structllm/src/structllm/llama_out",
    overwrite_output_dir=True,
    seed=42,
    report_to="wandb",
    # Precision.
    fp16=False,
    bf16=True,
    # Run length and batching.
    num_train_epochs=2,
    max_steps=max_steps,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=gradient_accumulation_steps,
    dataloader_num_workers=2,
    # group_by_length is intentionally left disabled here:
    #group_by_length=group_by_length,
    # Optimisation and schedule.
    optim=optim,
    learning_rate=5e-4,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    # Logging, evaluation and checkpointing all run every 5 steps.
    logging_strategy="steps",
    logging_first_step=True,
    logging_steps=5,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=5,
    save_strategy="steps",
    save_steps=5,  # number of steps between checkpoints
    save_total_limit=5,
    # Best-model selection: lower eval_rmse is better.
    metric_for_best_model="eval_rmse",
    greater_is_better=False,
    load_best_model_at_end=True,
)


In [22]:
# Assemble the trainer from the model, tokenizer, arguments and dataset splits.
# No `callbacks` argument is supplied, and `data_collator=None` is passed
# explicitly.
trainer = Trainer(
    model=model,
    tokenizer=llama_tokenizer,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=_compute_metrics,
    data_collator=None,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Run the training loop configured on `trainer`.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpvt-nawaf[0m ([33mlama-lab[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Rmse
5,3.6624,1.01041,1.005
10,0.8814,0.636976,0.798
15,0.5843,0.404698,0.636
20,0.3749,0.537788,0.733
25,0.4349,0.58957,0.768
30,0.3453,0.431164,0.657
35,0.3473,0.79749,0.893
40,0.476,0.557375,0.747
45,0.5178,0.2075,0.456
50,0.4194,1.75657,1.325


Checkpoint destination directory /home/so87pot/n0w0f/structllm/src/structllm/llama_out/checkpoint-5 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/so87pot/n0w0f/structllm/src/structllm/llama_out/checkpoint-15 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/so87pot/n0w0f/structllm/src/structllm/llama_out/checkpoint-20 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/so87pot/n0w0f/structllm/src/structllm/llama_out/checkpoint-25 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory /home/so87pot/n0w0f/structllm/src/structllm/llama_out/checkpoint-30 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [33]:
# Shell escape: print the GPU status table from the `nvidia-smi` CLI.
!nvidia-smi

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x153f6afbea90>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 153f6a39c490, raw_cell="!nvidia-smi" store_history=True silent=False shell_futures=True cell_id=88a1607b-a208-4127-aaf9-848927bbda5a>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

Mon Apr 22 10:26:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB           On | 00000000:81:00.0 Off |                    0 |
| N/A   28C    P0               74W / 500W|  11885MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [None]:
# Shell escape: force-kill (SIGKILL) the process with PID 3407242.
# NOTE(review): the PID is hard-coded and only meaningful for one session —
# presumably a leftover scratch cell; confirm before re-running.
!sudo kill -9 3407242

In [None]:
# NOTE(review): `Rasheeda` is not defined in any visible cell, so this
# expression presumably raises NameError on a fresh kernel — looks like a
# leftover scratch cell; confirm and remove.
Rasheeda*0
