In [1]:

import os
import json
from functools import partial
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import torch
import wandb
from datasets import DatasetDict, load_dataset
from omegaconf import DictConfig
from peft import (
    LoraConfig,
    get_peft_model,
)
from torch import nn
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
)

from mattext.models.utils import (
    CustomWandbCallback_FineTune,
    EvaluateFirstStepCallback,
)

from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoModelForCausalLM,AutoTokenizer
import torch
from transformers import pipeline, logging


2024-05-17 11:15:06.677808: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-17 11:15:06.678064: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-17 11:15:06.740531: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-17 11:15:06.875682: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
IGNORE_INDEX = -100
MAX_LENGTH = 2048
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

In [3]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict,
    llama_tokenizer,
    model,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = llama_tokenizer.add_special_tokens(special_tokens_dict)
    llama_tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(llama_tokenizer), pad_to_multiple_of=8)

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True
        )
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg

    model.config.pad_token_id = llama_tokenizer.pad_token_id
    output_embeddings[-num_new_tokens:] = output_embeddings_avg



In [4]:
def _setup_model():
    pretrained_ckpt = "meta-llama/Llama-2-7b-hf"
    base_model = AutoModelForCausalLM.from_pretrained(
        pretrained_ckpt,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt, trust_remote_code=True)
    special_tokens_dict = dict()
    if tokenizer.pad_token is None:
        special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
    if tokenizer.eos_token is None:
        special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
    if tokenizer.bos_token is None:
        special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
    if tokenizer.unk_token is None:
        special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

    print(special_tokens_dict)

    smart_tokenizer_and_embedding_resize(
                special_tokens_dict=special_tokens_dict,
                llama_tokenizer=tokenizer,
                model=base_model,
            )
    return base_model, tokenizer


In [66]:
from datasets import DatasetDict, load_dataset
representation ="slice"

PROPERTY_MAP : {
    "matbench_log_gvrh" : "shear modulus (in GPa)",
    "matbench_log_kvrh" : "bulk modulus (in GPa)",
    "matbench_dielectric" : "refractive index",
    "matbench_perovskites" : "formation energy (in eV)",}

MATERIAL_MAP : {
    "matbench_log_gvrh" : "material",
    "matbench_log_kvrh" : "material",
    "matbench_dielectric" : "dielectric material",
    "matbench_perovskites" : "perovskite material",   }

representation = "slice"
property_ = "bulk modulus (in GPa)"
material_ = "material"


def format_qstns(sample):
        instruct = f"""### Instruction:
Below is a {material_} represented as string. Followed by a question. Write a response to the question.\n"""
        material = f"{sample[representation]}\n"
        question = f"""The question is ### Question: What is the {property_} of this material? \n"""
        response = f"""The response is ### Response:"""
        # join all the parts together
        return "".join([i for i in [instruct, material,question, response] if i is not None])

def template_dataset(sample):
    sample["text"] = f"{format_qstns(sample)}"
    return sample

def _prepare_datasets(path: str) -> DatasetDict:
    """
    Prepare training and validation datasets.

    Args:
        train_df (pd.DataFrame): DataFrame containing training data.

    Returns:
        DatasetDict: Dictionary containing training and validation datasets.
    """

    ds = load_dataset("json", data_files=path, split="train")
    dataset = ds.map(template_dataset, remove_columns=list(ds.features))
    return dataset

In [67]:
testset = _prepare_datasets("/work/so87pot/material_db/all_1/test_matbench_log_kvrh_0.json")

Map:   0%|          | 0/1528 [00:00<?, ? examples/s]

In [9]:
def get_model_for_eval(adapter, batch_size = 1):
    base_model, tokenizer = _setup_model()
    model = PeftModel(base_model, adapter)
    model = model.merge_and_unload()

    logging.set_verbosity(logging.CRITICAL)

    # Initialize the pipeline
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    batch_size=batch_size,
                    max_length=1024,
                    do_sample=True,
                    temperature=0.1)

    return pipe

In [63]:
adapter_path = "/work/so87pot/mattext/megaloop2/finetune/llama_collator/checkpoints/train_slice_matbench_log_kvrh_0/checkpoint-3800"
pipe = get_model_for_eval(adapter_path, batch_size=8)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'pad_token': '[PAD]'}


AttributeError: 'str' object has no attribute 'peft_type'

In [64]:
base_model, tokenizer = _setup_model()
    


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'pad_token': '[PAD]'}


In [70]:
adapter_path = "/work/so87pot/mattext/megaloop2/finetune/llama_collator/checkpoints/train_composition_matbench_log_kvrh_0/checkpoint-3775/"
model = PeftModel.from_pretrained(base_model, adapter_path)
model = model.merge_and_unload()

logging.set_verbosity(logging.CRITICAL)

# Initialize the pipeline
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
                batch_size=8,
                max_length=2048,
                do_sample=True,
                temperature=0.1)

In [68]:
testset['text']

['### Instruction:\nBelow is a material represented as string. Followed by a question. Write a response to the question.\nTi Al 0 1 o o o 0 1 o o + 0 1 o + o 0 1 o + + 0 1 + o o 0 1 + o + 0 1 + + o 0 1 + + + 0 0 + o o 0 0 o + o 1 1 + o o 1 1 o + o \nThe question is ### Question: What is the bulk modulus (in GPa) of this material? \nThe response is ### Response:',
 '### Instruction:\nBelow is a material represented as string. Followed by a question. Write a response to the question.\nMg Mg S S O O O O O O O O 0 7 - - - 0 5 - - o 0 10 - o o 0 8 o - - 0 4 o o - 0 6 o o o 1 5 - - o 1 7 - - o 1 9 - o o 1 11 o - o 1 6 o o o 1 4 o o o 2 11 o o o 2 8 o o o 2 4 o o o 2 7 o o o 3 6 o o o 3 5 o o o 3 10 o o o 3 9 o o o \nThe question is ### Question: What is the bulk modulus (in GPa) of this material? \nThe response is ### Response:',
 '### Instruction:\nBelow is a material represented as string. Followed by a question. Write a response to the question.\nCu Cu B B Se Se Se Se 0 4 - - o 0 6 - o - 

In [26]:
pipe([testset['text'][i] for i in range(25)] )

[[{'generated_text': '### Instruction:\nBelow is a material represented as string. Followed by a question. Write a response to the question.\nAlTi\nThe question is ### Question: What is the bulk modulus (in GPa) of this material? \nThe response is ### Response:.6manesix1.49612964444544564.696644564.696664456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.696664

In [57]:
resp = pipe(testset['text'][0:16])

In [58]:
resp

[[{'generated_text': '### Instruction:\nBelow is a material represented as string. Followed by a question. Write a response to the question.\nAlTi\nThe question is ### Question: What is the bulk modulus (in GPa) of this material? \nThe response is ### Response:.6manesix1.49612964444544564.696644564.696664456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.69666456.696664

In [71]:
responses_dict = {}
resp = pipe(testset['text'])
for j, (prompt, responses) in enumerate(zip(testset['text'], resp)):
    #print(prompt)
    generated_text = responses[0]['generated_text']
    # Extract the response part by removing the prompt part
    complete_response = generated_text.replace(prompt, '').strip()
    parsed_answer = complete_response.replace("The response is ### Response:", '').strip()
    print(parsed_answer)
    responses_dict[j] = {"prompt": prompt, "response": responses[0]['generated_text'], "parsed_answer":parsed_answer }
    
with open("test_prompt_response_slice.json", "w") as f:
    json.dump(responses_dict, f, indent=4)    

.61318130.1964441 1.1964441 1.5964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.9964441 1.

In [43]:
for prompt, res in ()
with open("test_prompt_response.json", "w") as f:
    json.dump(resp, f, indent=4)

SyntaxError: invalid syntax (592126758.py, line 1)

In [None]:



responses = {}

for i, prompt in enumerate(testset['text']):
    resp = pipe(prompt)
    responses[i] = {"prompt": prompt, "response": resp[0]['generated_text']}
    print(resp[0]['generated_text'])

# Save the prompts and responses to a JSON file
with open("prompt_response.json", "w") as f:
    json.dump(responses, f, indent=4)

