In [1]:
import os
from functools import partial
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import torch
import wandb
from datasets import DatasetDict, load_dataset
from omegaconf import DictConfig
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from torch import nn
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    LlamaForSequenceClassification,
    #LlamaTokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer

from mattext.models.utils import (
    CustomWandbCallback_FineTune,
    EvaluateFirstStepCallback,
)


2024-05-16 02:18:43.969920: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-16 02:18:43.970118: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-16 02:18:44.028210: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-16 02:18:44.156164: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [25]:

IGNORE_INDEX = -100
MAX_LENGTH = 2048
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
use_flash_attention = True

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict,
    llama_tokenizer,
    model,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = llama_tokenizer.add_special_tokens(special_tokens_dict)
    llama_tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(llama_tokenizer), pad_to_multiple_of=8)

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True
        )
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg

    model.config.pad_token_id = llama_tokenizer.pad_token_id
    output_embeddings[-num_new_tokens:] = output_embeddings_avg




def _setup_model_tokenizer() -> None:
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        model_max_length=MAX_LENGTH,
        padding_side="right",
        use_fast=False,
    )
    #tokenizer.pad_token = tokenizer.eos_token


    bnb_config = BitsAndBytesConfig(load_in_4bit=True, 
                                    bnb_4bit_use_double_quant=True, 
                                    bnb_4bit_quant_type="nf4", 
                                    bnb_4bit_compute_dtype=torch.bfloat16)

    device_map = "auto" 
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        use_cache=False,
        use_flash_attention_2=use_flash_attention,
        quantization_config=bnb_config,
        device_map=device_map,
    )

    peft_config = LoraConfig(
                    lora_alpha=16,
                    lora_dropout=0.1,
                    r=64,
                    bias="none",
                    task_type="CAUSAL_LM",)
    
    #model = prepare_model_for_kbit_training(model)
    #model = get_peft_model(model, peft_config)

    special_tokens_dict = dict()
    if tokenizer.pad_token is None:
        special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
    if tokenizer.eos_token is None:
        special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
    if tokenizer.bos_token is None:
        special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
    if tokenizer.unk_token is None:
        special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        llama_tokenizer=tokenizer,
        model=model,
    )


    print(len(tokenizer))
    return model, tokenizer, peft_config



Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd8a956d30, raw_cell="
IGNORE_INDEX = -100
MAX_LENGTH = 2048
DEFAULT_PAD.." store_history=True silent=False shell_futures=True cell_id=18242c15-0617-4a8c-9a14-8f1cdc70ebcd>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd56c1fdc0, execution_count=25 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd8a956d30, raw_cell="
IGNORE_INDEX = -100
MAX_LENGTH = 2048
DEFAULT_PAD.." store_history=True silent=False shell_futures=True cell_id=18242c15-0617-4a8c-9a14-8f1cdc70ebcd> result=None>,),kwargs {}:


TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [29]:
model

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd19952340, raw_cell="model" store_history=True silent=False shell_futures=True cell_id=16a2197d-3599-4011-87b2-66b9beae9179>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32008, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): L

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd199521f0, execution_count=29 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd19952340, raw_cell="model" store_history=True silent=False shell_futures=True cell_id=16a2197d-3599-4011-87b2-66b9beae9179> result=LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32008, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
  

TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [28]:
model, tokenizer,peft_config = _setup_model_tokenizer()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd56b2fbb0, raw_cell="model, tokenizer,peft_config = _setup_model_tokeni.." store_history=True silent=False shell_futures=True cell_id=caf4527d-110d-4726-bab5-8bb5389588fd>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

32001
Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd56b2faf0, execution_count=28 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd56b2fbb0, raw_cell="model, tokenizer,peft_config = _setup_model_tokeni.." store_history=True silent=False shell_futures=True cell_id=caf4527d-110d-4726-bab5-8bb5389588fd> result=None>,),kwargs {}:


TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [4]:
ds = load_dataset("json", data_files="/work/so87pot/material_db/all_1/train_matbench_log_gvrh_0.json", split="train")


In [5]:
from random import randrange
print(f"dataset size: {len(ds)}")
#print(ds[randrange(len(ds))])


dataset size: 6073


In [6]:
ds

Dataset({
    features: ['atoms', 'crystal_llm_rep', 'atoms_params', 'wycoff_rep', 'mbid', 'cif_symmetrized', 'zmatrix', 'slice', 'cif_p1', 'composition', 'labels'],
    num_rows: 6073
})

In [40]:
def format_instruction(sample):
	return [f"""### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM.
 
### Input:
{sample['composition']}
 
The response is ### Response:
{sample['labels']}"""]



def template_dataset(sample):
    sample["text"] = f"{format_instruction(sample)}"
    return sample

dataset = ds.map(template_dataset, remove_columns=list(ds.features))

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd56baf970, raw_cell="def format_instruction(sample):
	return [f"""### I.." store_history=True silent=False shell_futures=True cell_id=4dad3206-c3ee-4469-b3a8-2c8d22352ef9>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

Map:   0%|          | 0/6073 [00:00<?, ? examples/s]

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd56baf250, execution_count=40 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd56baf970, raw_cell="def format_instruction(sample):
	return [f"""### I.." store_history=True silent=False shell_futures=True cell_id=4dad3206-c3ee-4469-b3a8-2c8d22352ef9> result=None>,),kwargs {}:


TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [41]:
response_template_with_context = "response is ### Response"# "\n### Response:"  # We added context here: "\n". This is enough for this tokenizer
response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`
print(response_template_with_context,response_template_ids)

data_collator = DataCollatorForCompletionOnlyLM(response_template_ids,tokenizer=tokenizer)

args = TrainingArguments(
    output_dir="try-llama",
    num_train_epochs=3,
    per_device_train_batch_size=6 if use_flash_attention else 4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=True # disable tqdm since with packing values are in correct
)

from trl import SFTTrainer
 
max_seq_length = 2048 # max sequence length for model and packing of the dataset
 
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=False,
    data_collator=data_collator,
    dataset_text_field="text",
    #formatting_func=format_instruction,
    args=args,
)



Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd5676b880, raw_cell="response_template_with_context = "response is ### .." store_history=True silent=False shell_futures=True cell_id=15aa85a1-0ba1-4925-9482-4388109f071e>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

response is ### Response [835, 13291]


Map:   0%|          | 0/6073 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd19d5ec10, execution_count=41 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd5676b880, raw_cell="response_template_with_context = "response is ### .." store_history=True silent=False shell_futures=True cell_id=15aa85a1-0ba1-4925-9482-4388109f071e> result=None>,),kwargs {}:


TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [42]:
dataset

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd185c13d0, raw_cell="dataset" store_history=True silent=False shell_futures=True cell_id=c4c54eaa-8633-4cb8-9b84-f5aeeb9a71cb>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

Dataset({
    features: ['text'],
    num_rows: 6073
})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd185c1850, execution_count=42 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd185c13d0, raw_cell="dataset" store_history=True silent=False shell_futures=True cell_id=c4c54eaa-8633-4cb8-9b84-f5aeeb9a71cb> result=Dataset({
    features: ['text'],
    num_rows: 6073
})>,),kwargs {}:


TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [36]:
tokenizer.encode("\n\n### Response:\n1.724275869600789")

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd18620340, raw_cell="tokenizer.encode("\n\n### Response:\n1.72427586960.." store_history=True silent=False shell_futures=True cell_id=19f22e16-b70f-4317-9592-386d246e63a4>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

[1,
 29871,
 13,
 13,
 2277,
 29937,
 13291,
 29901,
 13,
 29896,
 29889,
 29955,
 29906,
 29946,
 29906,
 29955,
 29945,
 29947,
 29953,
 29929,
 29953,
 29900,
 29900,
 29955,
 29947,
 29929]

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd186200a0, execution_count=36 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd18620340, raw_cell="tokenizer.encode("\n\n### Response:\n1.72427586960.." store_history=True silent=False shell_futures=True cell_id=19f22e16-b70f-4317-9592-386d246e63a4> result=[1, 29871, 13, 13, 2277, 29937, 13291, 29901, 13, 29896, 29889, 29955, 29906, 29946, 29906, 29955, 29945, 29947, 29953, 29929, 29953, 29900, 29900, 29955, 29947, 29929]>,),kwargs {}:


TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [43]:
trainer.train()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 14fd185c12e0, raw_cell="trainer.train()" store_history=True silent=False shell_futures=True cell_id=6efb2f0f-b870-434f-a1ed-efdc1a8be158>,),kwargs {}:


TypeError: _resume_backend() takes 1 positional argument but 2 were given

{'loss': 2.5563, 'grad_norm': 0.3671875, 'learning_rate': 0.0002, 'epoch': 0.02}
{'loss': 1.9523, 'grad_norm': 1.9296875, 'learning_rate': 0.0002, 'epoch': 0.04}
{'loss': 1.8117, 'grad_norm': 0.142578125, 'learning_rate': 0.0002, 'epoch': 0.06}
{'loss': 1.75, 'grad_norm': 0.2392578125, 'learning_rate': 0.0002, 'epoch': 0.08}
{'loss': 1.7561, 'grad_norm': 0.1708984375, 'learning_rate': 0.0002, 'epoch': 0.1}
{'loss': 1.6973, 'grad_norm': 0.3984375, 'learning_rate': 0.0002, 'epoch': 0.12}
{'loss': 1.6412, 'grad_norm': 0.44140625, 'learning_rate': 0.0002, 'epoch': 0.14}
{'loss': 1.5684, 'grad_norm': 0.875, 'learning_rate': 0.0002, 'epoch': 0.16}
{'loss': 1.4363, 'grad_norm': 0.8125, 'learning_rate': 0.0002, 'epoch': 0.18}
{'loss': 1.3175, 'grad_norm': 0.7890625, 'learning_rate': 0.0002, 'epoch': 0.2}
{'loss': 1.2526, 'grad_norm': 1.3515625, 'learning_rate': 0.0002, 'epoch': 0.22}
{'loss': 1.172, 'grad_norm': 1.5, 'learning_rate': 0.0002, 'epoch': 0.24}
{'loss': 1.1475, 'grad_norm': 1.77343



{'loss': 0.2769, 'grad_norm': 1.4921875, 'learning_rate': 0.0002, 'epoch': 1.01}
{'loss': 0.3086, 'grad_norm': 0.796875, 'learning_rate': 0.0002, 'epoch': 1.03}
{'loss': 0.3115, 'grad_norm': 0.5625, 'learning_rate': 0.0002, 'epoch': 1.05}
{'loss': 0.2499, 'grad_norm': 0.279296875, 'learning_rate': 0.0002, 'epoch': 1.07}
{'loss': 0.2893, 'grad_norm': 1.1484375, 'learning_rate': 0.0002, 'epoch': 1.09}
{'loss': 0.3045, 'grad_norm': 1.125, 'learning_rate': 0.0002, 'epoch': 1.11}
{'loss': 0.3333, 'grad_norm': 1.4453125, 'learning_rate': 0.0002, 'epoch': 1.13}
{'loss': 0.3477, 'grad_norm': 0.77734375, 'learning_rate': 0.0002, 'epoch': 1.15}
{'loss': 0.2807, 'grad_norm': 0.59765625, 'learning_rate': 0.0002, 'epoch': 1.16}
{'loss': 0.2679, 'grad_norm': 0.63671875, 'learning_rate': 0.0002, 'epoch': 1.18}
{'loss': 0.3278, 'grad_norm': 0.8125, 'learning_rate': 0.0002, 'epoch': 1.2}
{'loss': 0.2873, 'grad_norm': 0.53515625, 'learning_rate': 0.0002, 'epoch': 1.22}
{'loss': 0.2761, 'grad_norm': 0.76



{'loss': 0.2386, 'grad_norm': 1.265625, 'learning_rate': 0.0002, 'epoch': 2.01}
{'loss': 0.2511, 'grad_norm': 0.9609375, 'learning_rate': 0.0002, 'epoch': 2.03}
{'loss': 0.2579, 'grad_norm': 0.369140625, 'learning_rate': 0.0002, 'epoch': 2.05}
{'loss': 0.2284, 'grad_norm': 0.3671875, 'learning_rate': 0.0002, 'epoch': 2.07}
{'loss': 0.2642, 'grad_norm': 2.28125, 'learning_rate': 0.0002, 'epoch': 2.09}
{'loss': 0.2278, 'grad_norm': 0.59375, 'learning_rate': 0.0002, 'epoch': 2.11}
{'loss': 0.2171, 'grad_norm': 0.4140625, 'learning_rate': 0.0002, 'epoch': 2.13}
{'loss': 0.2396, 'grad_norm': 0.57421875, 'learning_rate': 0.0002, 'epoch': 2.15}
{'loss': 0.2173, 'grad_norm': 0.703125, 'learning_rate': 0.0002, 'epoch': 2.17}
{'loss': 0.2444, 'grad_norm': 0.2314453125, 'learning_rate': 0.0002, 'epoch': 2.19}
{'loss': 0.2482, 'grad_norm': 0.2236328125, 'learning_rate': 0.0002, 'epoch': 2.21}
{'loss': 0.2722, 'grad_norm': 0.2255859375, 'learning_rate': 0.0002, 'epoch': 2.23}
{'loss': 0.2288, 'grad



{'train_runtime': 1468.153, 'train_samples_per_second': 12.409, 'train_steps_per_second': 1.034, 'train_loss': 0.4176751161595421, 'epoch': 3.0}


TrainOutput(global_step=1518, training_loss=0.4176751161595421, metrics={'train_runtime': 1468.153, 'train_samples_per_second': 12.409, 'train_steps_per_second': 1.034, 'train_loss': 0.4176751161595421, 'epoch': 3.0})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x14fd8be22790>> (for post_run_cell), with arguments args (<ExecutionResult object at 14fd185c18e0, execution_count=43 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 14fd185c12e0, raw_cell="trainer.train()" store_history=True silent=False shell_futures=True cell_id=6efb2f0f-b870-434f-a1ed-efdc1a8be158> result=TrainOutput(global_step=1518, training_loss=0.4176751161595421, metrics={'train_runtime': 1468.153, 'train_samples_per_second': 12.409, 'train_steps_per_second': 1.034, 'train_loss': 0.4176751161595421, 'epoch': 3.0})>,),kwargs {}:


TypeError: _pause_backend() takes 1 positional argument but 2 were given

In [None]:
    def formatting_prompts_func(self,example):
        output_texts = []
        for i in range(len(example['labels'])):
            text = f"### Instruction: Below is a {self.material_} represented as string. Followed by a question. Write a response to the question.{example[self.representation]}Question: What is the {self.property_} of this material ? ### Answer: {round(float(example['labels'][i]),3)}"
            output_texts.append(text)
        return output_texts

### Response: {round(float(sample['labels']),3)}

    def format_qstns(self, sample):
        print(sample['labels'])
        return f"""### Instruction: Below is a {self.material_} represented as string. Followed by a question. Write a response to the question.{sample[self.representation]}Question: What is the {self.property_} of this material ?\n### Response: {round(float(sample['labels']),3)}"""
\

    def _prepare_datasets(self, path: str) -> DatasetDict:
        """
        Prepare training and validation datasets.

        Args:
            train_df (pd.DataFrame): DataFrame containing training data.

        Returns:
            DatasetDict: Dictionary containing training and validation datasets.
        """

        ds = load_dataset("json", data_files=path, split="train")
        ds = ds.train_test_split(shuffle=True, test_size=0.2, seed=42)
        #ds = dataset.map(self._tokenize, batched=True)

        # trainset = ds["train"].map(self.format_qstns)
        # testset = ds["test"].map(self.format_qstns)
        # print(trainset[0]["text"])
        # print(testset[0]["text"])
        # return trainset, testset
        return ds['train'], ds['test']

    def _callbacks(self) -> List[TrainerCallback]:
        """Returns a list of callbacks for early stopping, and custom logging."""
        callbacks = []

        if self.callbacks.early_stopping:
            callbacks.append(
                EarlyStoppingCallback(
                    early_stopping_patience=self.callbacks.early_stopping_patience,
                    early_stopping_threshold=self.callbacks.early_stopping_threshold,
                )
            )

        if self.callbacks.custom_logger:
            callbacks.append(CustomWandbCallback_FineTune())

        #callbacks.append(GenerationCallback)
        #callbacks.append(EvaluateFirstStepCallback)
        return callbacks


    def finetune(self) -> None:
        """
        Perform fine-tuning of the language model.
        """

        config_train_args = self.cfg.training_arguments
        callbacks = self._callbacks()

        # os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
        training_args = TrainingArguments(
            **config_train_args,
        )
        #response_template = " ### Response:"

        response_template_with_context = "?\n### Response:"  # We added context here: "\n". This is enough for this tokenizer
        response_template_ids = self.tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`

        data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=self.tokenizer)

        #collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=self.tokenizer)

        max_seq_length = MAX_LENGTH
        packing = False
        trainer = SFTTrainer(
            model=self.model,
            peft_config=self.peft_config,
            train_dataset=self.trainset,
            #dataset_text_field="text",
            formatting_func=self.formatting_prompts_func,
            max_seq_length=max_seq_length,
            tokenizer=self.tokenizer,
            args=training_args,
            packing=packing,
            callbacks=callbacks,
            data_collator=data_collator,
        )

        wandb.log({"Training Arguments": str(config_train_args)})
        wandb.log({"model_summary": str(self.model)})

        trainer.save_model(
            f"{self.cfg.path.finetuned_modelname}/llamav2-7b-no-fine-tune"
        )
        trainer.train()
        trainer.save_state(
            f"{self.cfg.path.finetuned_modelname}/llamav2-7b-lora-fine-tune"
        )
        trainer.save_model(
            f"{self.cfg.path.finetuned_modelname}/llamav2-7b-lora-fine-tune"
        )
        # eval_result = trainer.evaluate(eval_dataset=self.tokenized_dataset['test'])
        # wandb.log(eval_result)

        self.model.save_pretrained(f"{self.cfg.path.finetuned_modelname}/llamav2-7b-lora-save-pretrain", save_config=True)
        wandb.finish()
        return self.cfg.path.finetuned_modelname


In [None]:


IGNORE_INDEX = -100
MAX_LENGTH = 2048
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"




In [None]:

class FinetuneLLama():
    """Class to perform finetuning of a language model.
        Initialize the FinetuneModel.

    Args:
        cfg (DictConfig): Configuration for the fine-tuning.
        local_rank (int, optional): Local rank for distributed training. Defaults to None.
    """
    def __init__(self, cfg: DictConfig,local_rank=None) -> None:
        self.local_rank = local_rank
        self.representation = cfg.model.representation
        self.cfg = cfg.model.finetune
        self.context_length: int = self.cfg.context_length
        self.callbacks = self.cfg.callbacks
        self.ckpt = self.cfg.path.pretrained_checkpoint
        self.bnb_config = self.cfg.bnb_config
        self.model, self.tokenizer = self._setup_model_tokenizer()
        self.tokenized_dataset = self._prepare_datasets(self.cfg.path.finetune_traindata)

    def _setup_model_tokenizer(self) -> None:


        llama_tokenizer = LlamaTokenizer.from_pretrained(
        self.ckpt,
        model_max_length=MAX_LENGTH,
        padding_side="right",
        use_fast=False,
        )

        if (self.bnb_config.use_4bit and self.bnb_config.use_8bit):
            raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")

        elif (self.bnb_config.use_4bit or self.bnb_config.use_8bit):
            compute_dtype = getattr(torch,  self.bnb_config.bnb_4bit_compute_dtype)
            bnb_config = BitsAndBytesConfig(
            load_in_4bit= self.bnb_config.use_4bit,
            load_in_8bit= self.bnb_config.use_8bit,
            bnb_4bit_quant_type= self.bnb_config.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant= self.bnb_config.use_nested_quant,
            )
        else:
            bnb_config = None

        # Check GPU compatibility with bfloat16
        if compute_dtype == torch.float16:
            major, _ = torch.cuda.get_device_capability()
            if major >= 8:
                print("=" * 80)
                print("Your GPU supports bfloat16: accelerate training with bf16=True")
                print("=" * 80)

        device_map = {"": 0}
        model = LlamaForSequenceClassification.from_pretrained(self.ckpt,
                                                            num_labels=1,
                                                            quantization_config=bnb_config,
                                                            device_map=device_map
                                                            )
        
        lora_config = LoraConfig(
            **self.cfg.lora_config
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
        

        special_tokens_dict = dict()
        if llama_tokenizer.pad_token is None:
            special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
        if llama_tokenizer.eos_token is None:
            special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
        if llama_tokenizer.bos_token is None:
            special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
        if llama_tokenizer.unk_token is None:
            special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=special_tokens_dict,
            llama_tokenizer=llama_tokenizer,
            model=model,
        )

        print(len(llama_tokenizer))
        return model, llama_tokenizer
    
    def _tokenize(self, examples):
        tokenized_examples = self.tokenizer(
            examples[self.representation], truncation=True, padding=True, return_tensors="pt"
        )
        return tokenized_examples
    

    def _prepare_datasets(self, path: str) -> DatasetDict:
        """
        Prepare training and validation datasets.

        Args:
            train_df (pd.DataFrame): DataFrame containing training data.

        Returns:
            DatasetDict: Dictionary containing training and validation datasets.
        """

        ds = load_dataset("json", data_files=path,split="train")
        dataset = ds.train_test_split(shuffle=True, test_size=0.2, seed=42)
        return dataset.map(self._tokenize, batched=True)

    def _callbacks(self) -> List[TrainerCallback]:
        """Returns a list of callbacks for early stopping, and custom logging."""
        callbacks = []

        if self.callbacks.early_stopping:
            callbacks.append(EarlyStoppingCallback(
                early_stopping_patience=self.callbacks.early_stopping_patience,
                early_stopping_threshold=self.callbacks.early_stopping_threshold
            ))

        if self.callbacks.custom_logger:
            callbacks.append(CustomWandbCallback_FineTune())

        callbacks.append(EvaluateFirstStepCallback)

        return callbacks

    def _compute_metrics(self, p: Any, eval=True) -> Dict[str, float]:
        preds = torch.tensor(p.predictions.squeeze())  # Convert predictions to PyTorch tensor
        label_ids = torch.tensor(p.label_ids)  # Convert label_ids to PyTorch tensor

        if eval:
            # Calculate RMSE as evaluation metric
            eval_rmse = torch.sqrt(((preds - label_ids) ** 2).mean()).item()
            return {"eval_rmse": round(eval_rmse, 3)}
        else:
            # Calculate RMSE as training metric
            loss = torch.sqrt(((preds - label_ids) ** 2).mean()).item()
            return {"train_rmse": round(loss, 3), "loss": round(loss, 3)}
        
    


    def finetune(self) -> None:
        """
        Perform fine-tuning of the language model.
        """

        config_train_args = self.cfg.training_arguments
        callbacks = self._callbacks()


        #os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
        training_args = TrainingArguments(
            **config_train_args,
            metric_for_best_model="eval_rmse",  # Metric to use for determining the best model
            greater_is_better=False,  # Lower eval_rmse is better

        )
    
        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=None,
            compute_metrics=self._compute_metrics,
            tokenizer=self.tokenizer,
            train_dataset=self.tokenized_dataset['train'],
            eval_dataset=self.tokenized_dataset['test'],
            callbacks=callbacks,
        )

        wandb.log({"Training Arguments": str(config_train_args)})
        wandb.log({"model_summary": str(self.model)})

        trainer.train()
        trainer.save_model(f"{self.cfg.path.finetuned_modelname}/llamav2-7b-lora-fine-tune")

        eval_result = trainer.evaluate(eval_dataset=self.tokenized_dataset['test'])
        wandb.log(eval_result)

        self.model.save_pretrained(self.cfg.path.finetuned_modelname)
        wandb.finish()
        return self.cfg.path.finetuned_modelname

  