## Evaluate

In [3]:
import contextlib
from functools import partial
from typing import Any, Dict, List

import json
from omegaconf import OmegaConf
from datasets import DatasetDict, load_dataset
from torch import nn
from transformers import (
    AutoModelForSequenceClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

from structllm.models.utils import (
    CustomWandbCallback_FineTune,
    EvaluateFirstStepCallback,
    TokenizerMixin,
)


2024-05-12 12:21:01.861068: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-12 12:21:01.861202: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-12 12:21:01.991361: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-12 12:21:02.348947: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
class PotentialModel(TokenizerMixin):
    """Class to perform finetuning of a language model.
        Initialize the FinetuneModel.

    Args:
        cfg (DictConfig): Configuration for the fine-tuning.
        local_rank (int, optional): Local rank for distributed training. Defaults to None.
    """

    def __init__(self, cfg, local_rank=None) -> None:
        super().__init__(
            cfg=cfg.representation,
            special_tokens=cfg.special_tokens,
            special_num_token=cfg.special_num_token,
        )
        self.local_rank = local_rank
        self.config = cfg
        self.representation = cfg.representation
        self.alpha = cfg.alpha
        self.test_data = cfg.test_dataset
        self.context_length: int = cfg.context_length
        self.tokenized_testset = self._prepare_datasets(self.test_data, split="test")

    def _prepare_datasets(self, path: str, split) -> DatasetDict:
        """
        Prepare training and validation datasets.

        Args:
            train_df (pd.DataFrame): DataFrame containing training data.

        Returns:
            DatasetDict: Dictionary containing training and validation datasets.
        """

        ds = load_dataset("json", data_files=path, split="train")
        with contextlib.suppress(KeyError):
            ds = ds.remove_columns("labels")
        labal_name = f"total_energy_alpha_{self.alpha}"
        ds = ds.rename_column(labal_name, "labels")

        if split == "train":
            dataset = ds.train_test_split(shuffle=True, test_size=0.2, seed=42)
        else:
            dataset = ds
        # dataset= dataset.filter(lambda example: example[self.representation] is not None)
        return dataset.map(
            partial(
                self._tokenize_pad_and_truncate, context_length=self.context_length
            ),
            batched=True,
        )

    def _callbacks(self) -> List[TrainerCallback]:
        """Returns a list of callbacks for early stopping, and custom logging."""
        callbacks = []

        if self.callbacks.early_stopping:
            callbacks.append(
                EarlyStoppingCallback(
                    early_stopping_patience=self.callbacks.early_stopping_patience,
                    early_stopping_threshold=self.callbacks.early_stopping_threshold,
                )
            )

        if self.callbacks.custom_logger:
            callbacks.append(CustomWandbCallback_FineTune())

        callbacks.append(EvaluateFirstStepCallback)

        return callbacks

    def _compute_metrics(self, p: Any, eval=True) -> Dict[str, float]:
        preds = torch.tensor(
            p.predictions.squeeze()
        )  # Convert predictions to PyTorch tensor
        label_ids = torch.tensor(p.label_ids)  # Convert label_ids to PyTorch tensor

        if eval:
            # Calculate RMSE as evaluation metric
            eval_rmse = torch.sqrt(((preds - label_ids) ** 2).mean()).item()
            return {"eval_rmse": round(eval_rmse, 3)}
        else:
            # Calculate RMSE as training metric
            loss = torch.sqrt(((preds - label_ids) ** 2).mean()).item()
            return {"train_rmse": round(loss, 3), "loss": round(loss, 3)}

    def finetune(self) -> None:
        """
        Perform fine-tuning of the language model.
        """

        pretrained_ckpt = self.config.checkpoint


        model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_ckpt,
            num_labels=1,
            ignore_mismatched_sizes=False
        )
        if self.local_rank is not None:
            model = model.to(self.local_rank)
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[self.local_rank]
            )
        else:
            model = model.to("cuda")

        trainer = Trainer(
            model=model,
            data_collator=None,
            #callbacks = callbacks
        )

        eval_result = trainer.evaluate(eval_dataset=self.tokenized_testset)
        #predictions = trainer.predict(self.tokenized_testset)
        print(eval_result)
        return eval_result
        #print(predictions)

        # config_train_args = self.cfg.training_arguments
        # callbacks = self._callbacks()

        # training_args = TrainingArguments(
        #     **config_train_args,
        #     metric_for_best_model="eval_rmse",  # Metric to use for determining the best model
        #     greater_is_better=False,  # Lower eval_rmse is better
        # )
    

In [1]:
from omegaconf import OmegaConf
properties = [ "matbench_dielectric" , "matbench_log_kvrh", "matbench_perovskites" ]
alphas = [0,0.2,0.4,0.5,0.6,0.8,1]
ckpt_map = {
    0 : "0",
    0.2 : "0_2",
    0.4 : "0_4",
    0.5 : "0_5",
    0.6 : "0_6",
    0.8 : "0_8",
    1 : "1",
}


prop_alpha_result = {}
for prop in properties:
    prop_dict = {}
    for alpha in alphas:
        alpha_string = ckpt_map[alpha]
        test_dataset = f"/work/so87pot/material_db/lp/test_{prop}_0.json"
        checkpoint = f"/work/so87pot/structllm/megaloop/finetune/potential_{alpha_string}/checkpoints/finetuned_train_crystal_llm_rep_{prop}_0"
        print(test_dataset,checkpoint)

        config = {
        "representation": "crystal_llm_rep",
        "context_length": 512,
        "special_num_token": False,
        "alpha": alpha,
        "dataset_name": "matbench_perovskites",
        "test_dataset": test_dataset,
        "checkpoint": checkpoint,
        "special_tokens": {
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "cls_token": "[CLS]",
            "sep_token": "[SEP]",
            "mask_token": "[MASK]",
            "eos_token": "[EOS]",
            "bos_token": "[BOS]"}
        }

        conf = OmegaConf.create(config)
        pot = PotentialModel(conf)
        res = pot.finetune()

        prop_dict[alpha] = res

    key_name = f"{prop}"
    prop_alpha_result[key_name] = prop_dict
#dump results as json

with open('prop_alpha_result_2.json', 'w') as fp:
    json.dump(prop_alpha_result, fp)





/work/so87pot/material_db/lp/test_matbench_dielectric_0.json /work/so87pot/structllm/megaloop/finetune/potential_0/checkpoints/finetuned_train_crystal_llm_rep_matbench_dielectric_0


NameError: name 'OmegaConf' is not defined

In [None]:

properties = [ "matbench_dielectric" , "matbench_log_kvrh", "matbench_perovskites" ]
alphas = [0,0.2,0.4,0.5,0.6,0.8,1]
ckpt_map = {
    0 : "0",
    0.2 : "0_2",
    0.4 : "0_4",
    0.5 : "0_5",
    0.6 : "0_6",
    0.8 : "0_8",
    1 : "1",
}


prop_alpha_result = {}
for prop in properties:
    for alpha in alphas:
        alpha_string = ckpt_map[alpha]
        test_dataset = f"/work/so87pot/material_db/lp/test_{prop}_0.json"
        checkpoint = f"/work/so87pot/structllm/megaloop/finetune/potential_{alpha_string}/checkpoints/finetuned_train_crystal_llm_rep_{prop}_0"
        print(test_dataset,checkpoint)

        config = {
        "representation": "crystal_llm_rep",
        "context_length": 512,
        "special_num_token": False,
        "alpha": alpha,
        "dataset_name": "matbench_perovskites",
        "test_dataset": test_dataset,
        "checkpoint": checkpoint,
        "special_tokens": {
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "cls_token": "[CLS]",
            "sep_token": "[SEP]",
            "mask_token": "[MASK]",
            "eos_token": "[EOS]",
            "bos_token": "[BOS]"}
        }

        conf = OmegaConf.create(config)
        pot = PotentialModel(conf)
        res = pot.finetune()

        key_name = f"{prop}_{alpha}"
        prop_alpha_result[key_name] = res
#dump results as json

with open('prop_alpha_result.json', 'w') as fp:
    json.dump(prop_alpha_result, fp)





# Load Dataset

In [None]:
from datasets import load_dataset


In [None]:
#path= "/work/so87pot/material_db/qmof_text/qmof_filtered_text.json"
path="/work/so87pot/material_db/lp/test_matbench_dielectric_0.json"
ds = load_dataset("json", data_files=path,split="train")

In [None]:
ds.remove_columns('labels')
ds.rename_column('total_energy_alpha_0', 'labels')

Dataset({
    features: ['mbid', 'composition_energy', 'geometry_energy', 'labels', 'crystal_llm_rep', 'composition_energy_normalized', 'geometry_energy_normalized', 'label', 'total_energy_alpha_0.2', 'total_energy_alpha_0.4', 'total_energy_alpha_0.5', 'total_energy_alpha_0.6', 'total_energy_alpha_0.8', 'total_energy_alpha_1'],
    num_rows: 6069
})

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from datasets import Dataset


#path= "/work/so87pot/material_db/qmof_text/qmof_filtered_text.json"
path="/work/so87pot/material_db/lp_dataset/test_matbench_dielectric_0.json"
dataset = load_dataset("json", data_files=path,split="train")

# Assuming you have your dataset stored in a variable called 'dataset'

# Step 1: Normalize the features
scaler = MinMaxScaler()
composition_energy_reshaped = np.array(dataset['composition_energy']).reshape(-1, 1)
geometry_energy_reshaped = np.array(dataset['geometry_energy']).reshape(-1, 1)
normalized_features = scaler.fit_transform(composition_energy_reshaped)
normalized_geometry = scaler.fit_transform(geometry_energy_reshaped)

# Convert normalized features and geometry to lists
normalized_features_list = normalized_features
normalized_geometry_list = normalized_geometry

# Create a new dataset with the normalized features and geometry
# Convert new_dataset to a dictionary
new_dataset_dict = new_dataset.to_dict()

alphas = [0,0.2,0.4,0.5,0.6,0.8,1]
# Make the necessary modifications
for alpha in alphas:
    total_energy = alpha * np.array(new_dataset_dict['composition_energy_normalized']) + (1 - alpha) * np.array(new_dataset_dict['geometry_energy_normalized'])
    new_dataset_dict[f'total_energy_alpha_{alpha}'] = total_energy

# Convert the modified dictionary back to a Dataset object
new_dataset = Dataset.from_dict(new_dataset_dict)
new_dataset.to_json("/work/so87pot/material_db/lp/test_matbench_dielectric_0.json")


Creating json from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

4035018

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from datasets import Dataset


#path= "/work/so87pot/material_db/qmof_text/qmof_filtered_text.json"
path="/work/so87pot/material_db/lp_dataset/train_matbench_log_gvrh_0.json"
dataset = load_dataset("json", data_files=path,split="train")

# Assuming you have your dataset stored in a variable called 'dataset'

# Step 1: Normalize the features
scaler = MinMaxScaler()
composition_energy_reshaped = np.array(dataset['composition_energy']).reshape(-1, 1)
geometry_energy_reshaped = np.array(dataset['geometry_energy']).reshape(-1, 1)
normalized_features = scaler.fit_transform(composition_energy_reshaped)
normalized_geometry = scaler.fit_transform(geometry_energy_reshaped)

# Convert normalized features and geometry to lists
normalized_features_list = normalized_features.tolist()
normalized_geometry_list = normalized_geometry.tolist()

# Create a new dataset with the normalized features and geometry
# Convert new_dataset to a dictionary
new_dataset_dict = new_dataset.to_dict()

alphas = [0,0.2,0.4,0.5,0.6,0.8,1]
# Make the necessary modifications
for alpha in alphas:
    total_energy = alpha * np.array(new_dataset_dict['composition_energy_normalized']) + (1 - alpha) * np.array(new_dataset_dict['geometry_energy_normalized'])
    new_dataset_dict[f'total_energy_alpha_{alpha}'] = total_energy.tolist()

# Convert the modified dictionary back to a Dataset object
new_dataset = Dataset.from_dict(new_dataset_dict)
new_dataset.to_json("/work/so87pot/material_db/lp/train_matbench_log_gvrh_0.json")


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

494788

In [None]:
new_dataset['geometry_energy_normalized']

[[0.6817050603619018],
 [0.6832539118012717],
 [0.6842785976450603],
 [0.6794070332996888],
 [0.6733662975349656],
 [0.680205568966865],
 [0.6690141781422828],
 [0.6591256340624938],
 [0.6851457657439479],
 [0.6780207688860893],
 [0.4416425234798206],
 [0.6840397233288406],
 [0.686171819838583],
 [0.6764958461566817],
 [0.6638136016926822],
 [0.5928991674959536],
 [0.6861804825336201],
 [0.6845211807167846],
 [0.675972819122247],
 [0.6843235207413073],
 [0.5306019824753044],
 [0.68568199546247],
 [0.672685423022903],
 [0.676852009461932],
 [0.49726634426904104],
 [0.6855408085930039],
 [0.6774824486678793],
 [0.6310149934668796],
 [0.620228465864741],
 [0.6457621858128997],
 [0.6832289390343681],
 [0.6155453606311316],
 [0.5983700586114586],
 [0.6862405552129787],
 [0.6822615253921588],
 [0.6126113806566685],
 [0.6538111925429089],
 [0.5945029972340492],
 [0.6856932917563006],
 [0.6747037114643399],
 [0.677106056038349],
 [0.6835242350200174],
 [0.6203430146612877],
 [0.639552799921015

new_dataset['geometry_energy_normalized']

In [None]:
new_dataset['composition_energy_normalized']

[[0.049311583358275964],
 [0.054774019754564494],
 [0.26002693804250226],
 [0.11284046692607003],
 [0.16761448668063447],
 [0.14112541155342706],
 [0.0323256510026938],
 [0.08635139179886261],
 [0.09510625561209217],
 [0.2529182879377432],
 [0.05462436396288537],
 [0.07609997006884166],
 [0.026788386710565694],
 [0.2059263693504938],
 [0.04467225381622269],
 [0.0767734211313978],
 [0.006734510625561209],
 [0.008979347500748278],
 [0.1724034720143669],
 [0.1533971864711164],
 [0.14479197844956598],
 [0.13020053876085003],
 [0.3561807841963483],
 [0.20023944926668658],
 [0.045121221191260094],
 [0.07969170906914096],
 [0.02297216402274767],
 [0.07737204429811435],
 [0.041230170607602507],
 [0.04893744387907813],
 [0.05522298712960191],
 [0.1596079018258006],
 [0.5217000897934745],
 [0.04152948219096078],
 [0.03674049685722837],
 [0.03299910206524991],
 [0.04759054175396588],
 [0.2797066746483089],
 [0.07662376533971864],
 [0.16132894343011073],
 [0.3618677042801557],
 [0.2181981442681831

In [None]:
new_dataset['total_energy_alpha_0']

[[0.6817050603619018],
 [0.6832539118012717],
 [0.6842785976450603],
 [0.6794070332996888],
 [0.6733662975349656],
 [0.680205568966865],
 [0.6690141781422828],
 [0.6591256340624938],
 [0.6851457657439479],
 [0.6780207688860893],
 [0.4416425234798206],
 [0.6840397233288406],
 [0.686171819838583],
 [0.6764958461566817],
 [0.6638136016926822],
 [0.5928991674959536],
 [0.6861804825336201],
 [0.6845211807167846],
 [0.675972819122247],
 [0.6843235207413073],
 [0.5306019824753044],
 [0.68568199546247],
 [0.672685423022903],
 [0.676852009461932],
 [0.49726634426904104],
 [0.6855408085930039],
 [0.6774824486678793],
 [0.6310149934668796],
 [0.620228465864741],
 [0.6457621858128997],
 [0.6832289390343681],
 [0.6155453606311316],
 [0.5983700586114586],
 [0.6862405552129787],
 [0.6822615253921588],
 [0.6126113806566685],
 [0.6538111925429089],
 [0.5945029972340492],
 [0.6856932917563006],
 [0.6747037114643399],
 [0.677106056038349],
 [0.6835242350200174],
 [0.6203430146612877],
 [0.639552799921015