In [1]:

import os
import numpy as np
import pandas as pd


from torch.utils.data import Dataset
from peft import (
    LoraConfig, 
    get_peft_model, 
)

from functools import partial
from typing import Any, Dict, List

import torch
import wandb
from datasets import DatasetDict, load_dataset
from omegaconf import DictConfig

from torch import nn

from transformers import (
    LlamaTokenizer, 
    LlamaForSequenceClassification,
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer, 
    TrainingArguments,
    AutoModelForSequenceClassification,
    EarlyStoppingCallback,
    TrainerCallback,
)

from structllm.models.utils import CustomWandbCallback_FineTune, EvaluateFirstStepCallback


2024-05-16 09:19:45.586860: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-16 09:19:45.587053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-16 09:19:45.634538: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-16 09:19:45.743938: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [1]:

# Directory of the PEFT adapter that a later cell loads with PeftModel.from_pretrained.
final_try = "/work/so87pot/structllm/megaloop2/finetune/demo/comp_perov/finetuned_train_composition_matbench_log_gvrh_0/llamav2-7b-lora-fine-tune"


In [2]:
# Tokenizer-related constants.
# IGNORE_INDEX and MAX_LENGTH are not referenced anywhere else in the visible notebook.
IGNORE_INDEX = -100
MAX_LENGTH = 2048
# Fallback special tokens, registered only when the loaded tokenizer lacks the
# corresponding attribute (see the model-loading cell).
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def smart_tokenizer_and_embedding_resize(
    special_tokens_dict,
    llama_tokenizer,
    model,
):
    """Register missing special tokens and resize the model's embeddings to match.

    The tokens in ``special_tokens_dict`` are added to ``llama_tokenizer`` once.
    The embedding matrix is then resized to ``len(llama_tokenizer)`` rounded up
    to a multiple of 8, and the input embedding of every newly added token is
    set to the mean of the embeddings that existed before the call. Finally
    ``model.config.pad_token_id`` is synchronised with the tokenizer.

    Args:
        special_tokens_dict: Mapping passed to ``add_special_tokens``
            (e.g. ``{"pad_token": "[PAD]"}``); may be empty.
        llama_tokenizer: Tokenizer to extend; must support ``len()``,
            ``add_special_tokens`` and expose ``pad_token_id``.
        model: Model exposing ``resize_token_embeddings``,
            ``get_input_embeddings`` and ``config``.

    Returns:
        None. The tokenizer and the model are modified in place.
    """
    # Add the tokens exactly once; the return value is the number of tokens
    # that were actually new to the vocabulary.
    num_new_tokens = llama_tokenizer.add_special_tokens(special_tokens_dict)
    vocab_size = len(llama_tokenizer)
    model.resize_token_embeddings(vocab_size, pad_to_multiple_of=8)

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        # Rounding up to a multiple of 8 can append more rows than tokens were
        # added, so the new tokens are NOT necessarily the last rows. Address
        # them through the tokenizer length instead of negative indexing.
        # Assumes the new tokens received the highest token ids.
        first_new = vocab_size - num_new_tokens
        input_embeddings_avg = input_embeddings[:first_new].mean(dim=0, keepdim=True)
        input_embeddings[first_new:vocab_size] = input_embeddings_avg

    model.config.pad_token_id = llama_tokenizer.pad_token_id

In [8]:
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoModelForCausalLM,AutoTokenizer
import torch

# Checkpoint shared by the base model and its tokenizer.
pretrained_ckpt = "meta-llama/Llama-2-7b-hf"

# Half-precision base model; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_ckpt, low_cpu_mem_usage=True, return_dict=True,
    torch_dtype=torch.float16, device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt, trust_remote_code=True)

# Collect a fallback for every special token the tokenizer does not define,
# checked in the order pad, eos, bos, unk.
special_tokens_dict = {
    attr: fallback
    for attr, fallback in (
        ("pad_token", DEFAULT_PAD_TOKEN),
        ("eos_token", DEFAULT_EOS_TOKEN),
        ("bos_token", DEFAULT_BOS_TOKEN),
        ("unk_token", DEFAULT_UNK_TOKEN),
    )
    if getattr(tokenizer, attr) is None
}

print(special_tokens_dict)

# Register the missing tokens and resize the model embeddings accordingly.
smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    llama_tokenizer=tokenizer,
    model=base_model,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'pad_token': '[PAD]'}


In [10]:
# Earlier variant, kept for reference:
#model = PeftModel.from_pretrained(model, final_try, device_map="auto")

# Wrap the resized base model with the adapter stored at `final_try`.
# NOTE(review): PEFT may inject the adapter layers into `base_model` itself rather
# than copying it — confirm before treating `base_model` as an untouched baseline later.
model = PeftModel.from_pretrained(base_model, final_try) # as in crystal-text-llm
# Merging is left disabled here, so `model` remains a PeftModel wrapper.
#model = model.merge_and_unload()





In [11]:
from transformers import pipeline, logging

2024-05-15 11:22:51.086482: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-15 11:22:51.086691: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-15 11:22:51.134161: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-15 11:22:51.232149: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:

# Suppress Hugging Face pipeline logging
# (`logging` is transformers.logging, imported together with `pipeline`, not the stdlib module.)
logging.set_verbosity(logging.CRITICAL)

# Initialize the pipeline
# Sampling is enabled and no random seed is set in the visible notebook, so the
# generations below are not reproducible run-to-run.
# NOTE(review): max_length=1024 bounds prompt + completion; the saved outputs run on
# until that limit — consider a small max_new_tokens if only the numeric answer is wanted.
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=1024,
                do_sample=True,
                temperature=0.9)

In [14]:
from datasets import DatasetDict, load_dataset
# Key of the dataset field that holds the material's text representation.
representation = "composition"

# Prompt wording of the target property for each MatBench task.
# (Must be an assignment: `NAME : {...}` is only an annotation and binds nothing.)
PROPERTY_MAP = {
    "matbench_log_gvrh": "shear modulus (in GPa)",
    "matbench_log_kvrh": "bulk modulus (in GPa)",
    "matbench_dielectric": "refractive index",
    "matbench_perovskites": "formation energy (in eV)",
}

# Prompt wording of the material class for each MatBench task.
MATERIAL_MAP = {
    "matbench_log_gvrh": "material",
    "matbench_log_kvrh": "material",
    "matbench_dielectric": "dielectric material",
    "matbench_perovskites": "perovskite material",
}


def _tokenize(examples):
    """Tokenize the ``representation`` field of a batch of examples.

    Uses the module-level ``tokenizer`` and ``representation``. Truncation and
    padding are enabled and PyTorch tensors are requested from the tokenizer.
    """
    texts = examples[representation]
    return tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

def format_qstns(sample):
    """Render the shear-modulus question prompt for one sample.

    The prompt is the fixed question text, then ``sample[representation]``,
    then `` Answer:`` immediately followed by the tokenizer's EOS token.
    """
    property_name = "shear modulus (in GPa)"
    material_kind = "material"
    return (
        f"<s>Question: What is the {property_name} of the {material_kind} "
        + f"{sample[representation]}"
        + f" Answer:{tokenizer.eos_token}"
    )

# Attach the rendered prompt to a sample under the "text" key.
def template_dataset_test(sample):
    """Store ``format_qstns(sample)`` in ``sample["text"]`` and return the sample."""
    rendered = format_qstns(sample)
    sample["text"] = f"{rendered}"
    return sample

def _prepare_datasets(path: str) -> tuple:
    """
    Load a JSON dataset, split it 80/20 and attach a prompt to every sample.

    Args:
        path (str): JSON file handed to ``load_dataset`` as ``data_files``.

    Returns:
        tuple: ``(trainset, testset)`` — the two parts of the shuffled split
        (``test_size=0.2``, ``seed=42``), each tokenized with ``_tokenize`` and
        carrying the prompt from ``template_dataset_test`` in a ``"text"`` field.
    """
    # Distinct names per stage instead of rebinding `ds` to a different object.
    raw = load_dataset("json", data_files=path, split="train")
    splits = raw.train_test_split(shuffle=True, test_size=0.2, seed=42)
    tokenized = splits.map(_tokenize, batched=True)

    trainset = tokenized["train"].map(template_dataset_test)
    testset = tokenized["test"].map(template_dataset_test)
    # Show one rendered prompt per split as a sanity check.
    print(trainset[0]["text"])
    print(testset[0]["text"])
    return trainset, testset

In [15]:
# NOTE: this is the *train* JSON file; `testset` is the 20% hold-out that
# _prepare_datasets splits off from it, not a separate test file.
_,testset = _prepare_datasets("/work/so87pot/material_db/all_1/train_matbench_log_gvrh_0.json")

Map:   0%|          | 0/4858 [00:00<?, ? examples/s]

Map:   0%|          | 0/4858 [00:00<?, ? examples/s]

Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

<s>Question: What is the shear modulus (in GPa) of the material CoO2Zn Answer:</s>
<s>Question: What is the shear modulus (in GPa) of the material Pd3Ta Answer:</s>


In [57]:
# NOTE(review): the saved output under this cell shows an instruction-style prompt that
# no format_qstns in this notebook produces; it appears to be stale — re-run to refresh.
testset['text'][11]

'### Instruction:\n# Below is a material represented as string. Followed by a question. Write a response to the question.\n5.0 5.0 3.0\n90 90 120\nTb\n0.00 0.00 0.00\nCo\n0.50 0.00 0.50\nCo\n0.00 0.50 0.50\nCo\n0.50 0.50 0.50\nB\n0.33 0.67 0.00\nB\n0.67 0.33 0.00\nThe question is ### Question:\nWhat is the bulk modulus (in GPa) of this material? \nThe response is ### Response:\n2.32'

In [17]:
# `[:-4]` drops the last four characters of the prompt — the "</s>" that format_qstns
# appends — so the model continues right after "Answer:".
result = pipe(testset['text'][39][:-4])

In [18]:
result

[{'generated_text': '<s>Question: What is the shear modulus (in GPa) of the material Na2O4U Answer: 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 5.95 

In [19]:
# Generate a completion for the first 25 prompts of `testset`.
for i in range(25):
    # Index the row first so the whole "text" column is not rebuilt every iteration.
    prompt = testset[i]['text']
    # Remove the trailing EOS marker only when it is really there; a fixed
    # `[:-4]` would cut real prompt text when format_qstns adds no EOS.
    eos = tokenizer.eos_token
    if eos and prompt.endswith(eos):
        prompt = prompt[:-len(eos)]
    print(pipe(prompt))

[{'generated_text': '<s>Question: What is the shear modulus (in GPa) of the material Pd3Ta Answer: 5.70 Explanation: The shear modulus of Pd3Ta is 5.70 GPa.\nExplanation: The shear modulus of Pd3Ta is 5.70 GPa.\nWhat is the shear modulus (in GPa) of the material Pd3Ta Answer?\nThe shear modulus of Pd3Ta is 5.70 GPa.\nWhat is the shear modulus (in GPa) of the material Pd3Ta?\nThe shear modulus of Pd3Ta is 5.70 GPa. The shear modulus of Pd3Ta is 5.70 GPa.\nThe shear modulus of Pd3Ta is 5.70 GPa. The shear modulus of Pd3Ta is 5.70 GPa. The shear modulus of Pd3Ta is 5.70 GPa.\nWhat is the shear modulus (in GPa) of the material Pd3Ta?\nThe shear modulus of Pd3Ta is 5.70 GPa. The shear modulus of Pd3Ta is 5.70 GPa. The shear modulus of Pd3Ta is 5.70 GPa.\nWhat is the shear modulus (in GPa) of the material Pd3Ta? The shear modulus of Pd3Ta is 5.70 GPa. The shear modulus of Pd3Ta is 5.70 GPa. The shear modulus of Pd3Ta is 5.70 GPa.\nWhat is the shear modulus (in GPa) of the material Pd3Ta? The



[{'generated_text': '<s>Question: What is the shear modulus (in GPa) of the material C16 Answer: 29.957546666667\nThe shear modulus of the material C16 is 29.957546666667 GPa.\nThis result is obtained by inserting the material data from the following table into the formula of the shear modulus.\nT = 0 °C\nγ = 10-6 Pa\nρ = 2700 kg/m³\nE = 210 GPa\nν = 0.3\nk = 29.957546666667 GPa\nYou can find the material data for the material C16 in the data sheet.\nWhat is the shear modulus of the material C16?\nWhat is the shear modulus (in GPa) of the material C16?\nWhat is the shear modulus of the material C16?\nWhat is the shear modulus of the material C16?\nThe shear modulus of the material C16 is 29.957546666667 GPa.\nThe shear modulus of the material C16 is 29.957546666667 GPa. This is the value for the shear modulus, which was calculated based on the material data.\nWhat is the shear modulus of the material C16? – Explanation\nShear modulus formula\nThe shear modulus, symbolized by G, is a me

KeyboardInterrupt: 

In [20]:

# Rebuild the pipeline on `base_model` for comparison with the adapter-wrapped model.
# NOTE(review): `base_model` was already passed to PeftModel.from_pretrained in an
# earlier cell; PEFT may have modified it in place — confirm this is really the
# untouched base model (reload it in a fresh kernel to be sure).
pipe = pipeline(task="text-generation",
                model=base_model,
                tokenizer=tokenizer,
                max_length=1024,
                do_sample=True,
                temperature=0.9)



In [23]:
# Generate a completion for the first 5 prompts of `testset`.
for i in range(5):
    # Index the row first so the whole "text" column is not rebuilt every iteration.
    prompt = testset[i]['text']
    # Remove the trailing EOS marker only when it is really there; a fixed
    # `[:-4]` would cut real prompt text when format_qstns adds no EOS.
    eos = tokenizer.eos_token
    if eos and prompt.endswith(eos):
        prompt = prompt[:-len(eos)]
    print(pipe(prompt))

[{'generated_text': '<s>Question: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta Answer: What is the shear modulus (in GPa) of the material Pd3Ta

In [29]:
# Second adapter directory (under llama_instruct/checkpoints).
adapter = "/work/so87pot/structllm/megaloop2/finetune/llama_instruct/checkpoints/finetuned_train_composition_matbench_log_gvrh_0/llamav2-7b-lora-fine-tune"
# NOTE(review): `base_model` has already been wrapped with the `final_try` adapter in an
# earlier cell; confirm the two adapters are not being stacked unintentionally.
model = PeftModel.from_pretrained(base_model, adapter)
# Merge the adapter into the underlying model and drop the PEFT wrapper.
model = model.merge_and_unload()

In [30]:

# Build the generation pipeline on `model`, the merged model produced by the
# previous cell, instead of relying on `base_model` having been mutated in place.
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=1024,
                do_sample=True,
                temperature=0.1)

# Generate a completion for the first 5 prompts of `testset`.
for i in range(5):
    # Index the row first so the whole "text" column is not rebuilt every iteration.
    prompt = testset[i]['text']
    # Remove the trailing EOS marker only when it is really there; a fixed
    # `[:-4]` would cut real prompt text when format_qstns adds no EOS.
    eos = tokenizer.eos_token
    if eos and prompt.endswith(eos):
        prompt = prompt[:-len(eos)]
    print(pipe(prompt))



[{'generated_text': '<s>Question: What is the shear modulus (in GPa) of the material Pd3Ta Answer: 10.1111/j.1467-8113.2011.01485.x.\nThe shear modulus of a material is the measure of its stiffness. It is defined as the ratio of shear stress to shear strain. The shear modulus is a measure of the stiffness of a material. It is defined as the ratio of shear stress to shear strain.\nThe shear modulus of a material is the measure of its stiffness. It is defined as the ratio of shear stress to shear strain. The shear modulus is a measure of the stiffness of a material. It is defined as the ratio of shear stress to shear strain.\nThe shear modulus of a material is the measure of its stiffness. It is defined as the ratio of shear stress to shear strain. The shear modulus is a measure of the stiffness of a material. It is defined as the ratio of shear stress to shear strain.\nThe shear modulus of a material is the measure of its stiffness. It is defined as the ratio of shear stress to shear st

In [31]:
# Load the adapter again, this time without merging, so the wrapped module
# structure can be printed in the next cell.
adapter = "/work/so87pot/structllm/megaloop2/finetune/llama_instruct/checkpoints/finetuned_train_composition_matbench_log_gvrh_0/llamav2-7b-lora-fine-tune"
# NOTE(review): `base_model` has already gone through PeftModel.from_pretrained (and a
# merge) in earlier cells — confirm its state is what this cell expects.
model = PeftModel.from_pretrained(base_model, adapter)
#model = model.merge_and_unload()

# Disabled generation code, kept for reference:
# pipe = pipeline(task="text-generation",
#                 model=base_model,
#                 tokenizer=tokenizer,
#                 max_length=1024,
#                 do_sample=True,
#                 temperature=0.1)

# for i in range(5):
#  print(pipe(testset['text'][i][:-4]))

In [32]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32008, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
            

In [33]:
# Merge the adapter and drop the PEFT wrapper; the printout below shows a plain
# LlamaForCausalLM whose q_proj is an ordinary Linear layer again.
model = model.merge_and_unload()
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32008, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Lin

In [32]:
# from peft import LoraConfig, PeftModel, get_peft_model

# pretrained_ckpt = "meta-llama/Llama-2-7b-hf"
# base_model = AutoModelForCausalLM.from_pretrained(
#     pretrained_ckpt,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )

# tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt, trust_remote_code=True)
# special_tokens_dict = dict()
# if tokenizer.pad_token is None:
#     special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
# if tokenizer.eos_token is None:
#     special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
# if tokenizer.bos_token is None:
#     special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
# if tokenizer.unk_token is None:
#     special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

# print(special_tokens_dict)

# smart_tokenizer_and_embedding_resize(
#             special_tokens_dict=special_tokens_dict,
#             llama_tokenizer=tokenizer,
#             model=base_model,
#         )


# NOTE(review): `comp_path` is not defined anywhere in the visible notebook; on a fresh
# kernel this line raises NameError. Define it (presumably an adapter directory such as
# `final_try` — confirm) before running this cell.
model = PeftModel.from_pretrained(base_model, comp_path)
# Merge the adapter into the underlying model and drop the PEFT wrapper.
model = model.merge_and_unload()


In [33]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32008, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Lin

model

In [60]:
# Key of the dataset field that holds the material's text representation.
representation = "composition"

# Prompt wording of the target property for each MatBench task.
# (Must be an assignment: `NAME : {...}` is only an annotation and binds nothing.)
PROPERTY_MAP = {
    "matbench_log_gvrh": "shear modulus (in GPa)",
    "matbench_log_kvrh": "bulk modulus (in GPa)",
    "matbench_dielectric": "refractive index",
    "matbench_perovskites": "formation energy (in eV)",
}

# Prompt wording of the material class for each MatBench task.
MATERIAL_MAP = {
    "matbench_log_gvrh": "material",
    "matbench_log_kvrh": "material",
    "matbench_dielectric": "dielectric material",
    "matbench_perovskites": "perovskite material",
}


def _tokenize(examples):
    """Run the module-level ``tokenizer`` over ``examples[representation]``.

    Truncation and padding are switched on and the result is requested as
    PyTorch tensors; the tokenizer output is returned unchanged.
    """
    batch_texts = examples[representation]
    encoded = tokenizer(
        batch_texts, truncation=True, padding=True, return_tensors="pt"
    )
    return encoded

def format_qstns(sample):
    """Render the bulk-modulus question prompt for one sample (no EOS appended).

    The prompt is the fixed question text, then ``sample[representation]``,
    then `` Answer:``.

    NOTE(review): the cells around this definition load ``*_matbench_log_gvrh_*``
    files, which PROPERTY_MAP associates with the shear modulus — confirm that
    asking for the bulk modulus here is intended.
    """
    property_name = "bulk modulus (in GPa)"
    material_kind = "material"
    head = f"<s>Question: What is the {property_name} of the {material_kind} "
    body = f"{sample[representation]}"
    tail = " Answer:"
    return head + body + tail

# Write the rendered prompt into each sample's "text" field.
def template_dataset_test(sample):
    """Set ``sample["text"]`` to the output of ``format_qstns`` and return the sample."""
    text = format_qstns(sample)
    sample["text"] = f"{text}"
    return sample

def _prepare_datasets(path: str) -> tuple:
    """
    Load a JSON dataset, split it 80/20 and attach a prompt to every sample.

    Args:
        path (str): JSON file handed to ``load_dataset`` as ``data_files``.

    Returns:
        tuple: ``(trainset, testset)`` — the two parts of the shuffled split
        (``test_size=0.2``, ``seed=42``), each tokenized with ``_tokenize`` and
        carrying the prompt from ``template_dataset_test`` in a ``"text"`` field.
    """
    # Distinct names per stage instead of rebinding `ds` to a different object.
    raw = load_dataset("json", data_files=path, split="train")
    splits = raw.train_test_split(shuffle=True, test_size=0.2, seed=42)
    tokenized = splits.map(_tokenize, batched=True)

    trainset = tokenized["train"].map(template_dataset_test)
    testset = tokenized["test"].map(template_dataset_test)
    # Show one rendered prompt per split as a sanity check.
    print(trainset[0]["text"])
    print(testset[0]["text"])
    return trainset, testset

In [35]:
# NOTE: _prepare_datasets re-splits whatever file it is given, so `testset` here is
# only the 20% hold-out (306 rows in the saved output) of the test JSON file.
_,testset = _prepare_datasets("/work/so87pot/material_db/all_1/test_matbench_log_gvrh_0.json")

Map:   0%|          | 0/1222 [00:00<?, ? examples/s]

Map:   0%|          | 0/306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1222 [00:00<?, ? examples/s]

Map:   0%|          | 0/306 [00:00<?, ? examples/s]

<s>Question: What is the bulk modulus (in GPa) of the material Hf12N4Zn4 Answer:
<s>Question: What is the bulk modulus (in GPa) of the material CdCl3Cs Answer:


In [36]:

# Suppress Hugging Face pipeline logging
# (`logging` is transformers.logging, imported together with `pipeline`, not the stdlib module.)
logging.set_verbosity(logging.CRITICAL)

# Initialize the pipeline
# Sampling is enabled and no random seed is set in the visible notebook, so
# generations are not reproducible run-to-run.
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
                max_length=512,
                do_sample=True,
                temperature=1)

In [59]:
pipe(testset['text'][110])

[{'generated_text': '<s>Question: What is the bulk modulus (in GPa) of the material Be Pd Pd 0 1 - - - 0 1 - o - 0 1 o - - 0 1 o o - 0 2 - - o 0 2 - o o 0 2 o - o 0 2 o o o 0 0 o + o 0 0 + o o 1 2 - - o 1 2 - o o 1 2 o - o 1 2 o o o 1 2 o o + 1 1 o + o 1 1 + o o 2 2 o + o 2 2 + o o  Answer: 1) 0 1 - o - 2) 2 2 o + o 3) 0 1 o - - 4) 1 1 o + o 5) 2 2 o o o 6) 0 0 o o o 7) 1 1 o o o 8) 2 2 o o o 9) 0 0 o o o 10) 2 2 o o o 11) 0 1 o o o 12) 2 2 + o o 13) 1 1 + o o 14) 2 2 + o o 15) 0 0 o + + 16) 1 1 + o o 17) 2 2 + o o 18) 0 1 o o - 19) 2 2 + o o 20) 1 1 + o o 21) 2 2 + o o 22) 0 1 o o - 23) 2 2 + o - 24) 1 1 + o o 25) 2 2 + o o 26) 0 1 o o o 27) 1 1 + + o 28) 2 2 + o o 29) 1 1 + + o 30) 2 2 + o o 31) 1 1 + + o 32) 2 2 + o o 33) 1 1 + o o'}]

In [None]:
def load_model(model_name):
    """Load a quantized causal LM, its tokenizer and a LoRA config.

    Besides ``model_name`` this function reads the module-level settings
    ``bnb_4bit_compute_dtype``, ``use_4bit``, ``bnb_4bit_quant_type``,
    ``use_nested_quant``, ``device_map``, ``lora_alpha``, ``lora_dropout`` and
    ``lora_r``.
    NOTE(review): none of these are defined in the visible notebook; they must
    exist before this function is called.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained`` for
            both the model and the tokenizer.

    Returns:
        tuple: ``(model, tokenizer, peft_config)``.
    """
    # BitsAndBytesConfig is not part of the notebook's import cell; import it
    # here so the function does not fail with NameError.
    from transformers import BitsAndBytesConfig

    # Load tokenizer and model with QLoRA configuration
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    # Purely informational hint: compute capability >= 8 triggers the message.
    if compute_dtype == torch.float16 and use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=bnb_config
    )

    model.config.use_cache = False
    model.config.pretraining_tp = 1

    # Load LoRA configuration
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Load Tokenizer; the EOS token doubles as the padding token, padded on the right.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer, peft_config

In [None]:
# load_model returns (model, tokenizer, peft_config); unpacking only two names
# raises "too many values to unpack".
# NOTE(review): `full_path` is not defined in the visible notebook — set it to the
# checkpoint to load before running this cell.
model, tokenizer, peft_config = load_model(full_path)

In [85]:
def format_qstns(sample):
    """Render an instruction-style prompt that includes the answer.

    Layout: the ``<s>[INST]`` instruction line, ``sample[representation]`` on
    its own line, the bulk-modulus question line, and finally an answer
    sentence containing ``sample['labels']`` converted to float and rounded to
    3 decimals.
    """
    property_name = "bulk modulus (in GPa)"
    material_kind = "material"
    material_line = f"{sample[representation]}\n"
    rounded_label = round(float(sample['labels']), 3)
    return (
        f"<s>[INST] Below is a material represented as string. Followed by a question. Write a response to the question.\n"
        + material_line
        + f"Question: What is the {property_name} of the {material_kind}?\n"
        + f" Answer: The {property_name} of the {material_kind} is {rounded_label}"
    )

# Write the rendered prompt, terminated by the tokenizer's EOS token, into "text".
def template_dataset_test(sample):
    """Set ``sample["text"]`` to ``format_qstns(sample)`` plus the EOS token; return the sample."""
    rendered = format_qstns(sample)
    sample["text"] = f"{rendered}{tokenizer.eos_token}"
    return sample

def _prepare_datasets(path: str) -> tuple:
    """
    Load a JSON dataset, split it 80/20 and attach a prompt to every sample.

    Args:
        path (str): JSON file handed to ``load_dataset`` as ``data_files``.

    Returns:
        tuple: ``(trainset, testset)`` — the two parts of the shuffled split
        (``test_size=0.2``, ``seed=42``), each tokenized with ``_tokenize`` and
        carrying the prompt from ``template_dataset_test`` in a ``"text"`` field.
    """
    # Distinct names per stage instead of rebinding `ds` to a different object.
    raw = load_dataset("json", data_files=path, split="train")
    splits = raw.train_test_split(shuffle=True, test_size=0.2, seed=42)
    tokenized = splits.map(_tokenize, batched=True)

    trainset = tokenized["train"].map(template_dataset_test)
    testset = tokenized["test"].map(template_dataset_test)
    # Show one rendered prompt per split as a sanity check.
    print(trainset[0]["text"])
    print(testset[0]["text"])
    return trainset, testset

In [86]:
# NOTE: this is the *train* JSON file again; `testset` is its 20% hold-out, and with the
# format_qstns defined just above every "text" now contains the labelled answer.
_,testset = _prepare_datasets("/work/so87pot/material_db/all_1/train_matbench_log_gvrh_0.json")

Map:   0%|          | 0/4858 [00:00<?, ? examples/s]

Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

<s>[INST] Below is a material represented as string. Followed by a question. Write a response to the question.
CoO2Zn
Question: What is the bulk modulus (in GPa) of the material?
 Answer: The bulk modulus (in GPa) of the material is 1.813</s>
<s>[INST] Below is a material represented as string. Followed by a question. Write a response to the question.
Pd3Ta
Question: What is the bulk modulus (in GPa) of the material?
 Answer: The bulk modulus (in GPa) of the material is 1.949</s>


In [88]:
# 179 is a hand-picked character offset for this particular sample: per the output
# below, the slice ends after the question line plus one space, before "Answer:".
testset['text'][0][0:179]

'<s>[INST] Below is a material represented as string. Followed by a question. Write a response to the question.\nPd3Ta\nQuestion: What is the bulk modulus (in GPa) of the material?\n '

In [89]:
# Prompt the model with the first 179 characters only, which cuts the text before the
# labelled answer for this sample.
# NOTE(review): the offset depends on the length of the material string, and the
# "Answer:" cue is cut off too — confirm this is the intended prompt.
pipe(testset['text'][0][0:179])

[{'generated_text': "<s>[INST] Below is a material represented as string. Followed by a question. Write a response to the question.\nPd3Ta\nQuestion: What is the bulk modulus (in GPa) of the material?\n  The material has a bulk modulus of 110 GPa.\nQuestion: What is the elastic compliance (in GPa) of the material?\n  The material has an elastic compliance of 50 GPa.\nQuestion: What is the modulus of resilience (in GPa) of the material?\n  The material has a modulus of resilience of 65 GPa.\nQuestion: What is the stress strain coefficient (in GPa) of the material?\n  The material has a stress strain coefficient of 125 GPa.\nQuestion: What is the thermal expansion coefficient (in GPa) of the material?\n  The material has a thermal expansion coefficient of 10 GPa.\nQuestion: What is the fracture toughness (in GPa) of the material?\n  The material has a fracture toughness of 75 GPa.\nQuestion: What is the Young's modulus (in GPa) of the material?\n  The material has a Young's modulus of 10