In [3]:
!module load nvidia/cuda/12.1.0

In [1]:
from trl import SFTTrainer

2024-04-27 18:00:08.852678: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-27 18:00:10.249069: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-27 18:00:10.249311: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-27 18:00:10.379683: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-27 18:00:11.081331: I tensorflow/core/platform/cpu_feature_guar

In [2]:

import os
import numpy as np
import pandas as pd


from torch.utils.data import Dataset
from peft import (
    LoraConfig, 
    get_peft_model, 
)

from functools import partial
from typing import Any, Dict, List

import torch
import wandb
from datasets import DatasetDict, load_dataset
from omegaconf import DictConfig

from torch import nn

from transformers import (
    LlamaTokenizer, 
    LlamaForSequenceClassification,
    AutoTokenizer,
    Trainer, 
    TrainingArguments,
    AutoModelForSequenceClassification,
    EarlyStoppingCallback,
    TrainerCallback,
)

from structllm.models.utils import CustomWandbCallback_FineTune, EvaluateFirstStepCallback


In [3]:
IGNORE_INDEX = -100
MAX_LENGTH = 2048
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"


pretrained_ckpt = "meta-llama/Llama-2-7b-hf"

llama_tokenizer = LlamaTokenizer.from_pretrained(
    pretrained_ckpt,
    model_max_length=MAX_LENGTH,
    padding_side="right",
    use_fast=False,
    )
special_tokens_dict = dict()
if llama_tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
if llama_tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
if llama_tokenizer.bos_token is None:
    special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
if llama_tokenizer.unk_token is None:
    special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN

num_new_tokens = llama_tokenizer.add_special_tokens(special_tokens_dict)
llama_tokenizer.add_special_tokens(special_tokens_dict)
   


0

In [4]:
len(llama_tokenizer)

32001

In [5]:
def _tokenize(examples):
    # Tokenize the 'crystal_llm' column using the LAMA tokenizer
    tokenized_examples = llama_tokenizer(examples["crystal_llm_rep"],truncation=True, padding=True,)
    return tokenized_examples

def _prepare_datasets(path: str) -> DatasetDict:
        """
        Prepare training and validation datasets.

        Args:
            train_df (pd.DataFrame): DataFrame containing training data.

        Returns:
            DatasetDict: Dictionary containing training and validation datasets.
        """

        ds = load_dataset("json", data_files=path,split="train")
        dataset = ds.train_test_split(shuffle=True, test_size=0.2, seed=42)
        return dataset.map(_tokenize, batched=True)

In [6]:
dataset = _prepare_datasets("/work/so87pot/material_db/matbench_sml/train_matbench_log_kvrh_0.json")

Map:   0%|          | 0/4858 [00:00<?, ? examples/s]

Map:   0%|          | 0/1215 [00:00<?, ? examples/s]

In [9]:
dataset['train']['labels']

[2.2304489214,
 1.9138138524,
 2.1553360375,
 1.4471580313,
 1.6434526765,
 1.8750612634,
 1.9190780924,
 1.8750612634,
 2.4471580313,
 2.2671717284,
 1.9395192526,
 1.3010299957,
 1.531478917,
 1.6434526765,
 2.0492180227,
 2.4132997641,
 1.414973348,
 2.2810333672,
 2.2121876044,
 1.9395192526,
 1.9190780924,
 1.7242758696,
 1.6720978579,
 1.7634279936,
 1.8920946027,
 1.5440680444,
 2.1303337685,
 1.6127838567,
 2.1522883444,
 1.7634279936,
 2.0170333393,
 1.6434526765,
 2.0043213738,
 2.1613680022,
 2.0569048513,
 1.6532125138,
 1.7781512504,
 2.1931245984,
 2.1367205672,
 1.3802112417,
 1.3222192947,
 1.6627578317,
 2.1430148003,
 1.6434526765,
 2.2552725051,
 1.8512583487,
 1.5563025008,
 2.2329961104,
 2.2695129442,
 0.4771212547,
 2.6031443726,
 1.7993405495,
 2.3404441148,
 1.9190780924,
 1.7993405495,
 2.252853031,
 2.2227164711,
 1.8325089127,
 2.0863598307,
 1.4771212547,
 1.4623979979,
 1.1461280357,
 2.103803721,
 1.8750612634,
 2.2013971243,
 1.5051499783,
 1.3424226808,

In [None]:
def format_dolly(sample):
    instruction = f"<s>[INST] {sample['instruction']}"
    context = f"Here's some context: {sample['context']}" if len(sample["context"]) > 0 else None
    response = f" [/INST] {sample['response']}"
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{llama_tokenizer.eos_token}"
    return sample

In [None]:
# Shuffle the dataset
dataset_shuffled = dataset.shuffle(seed=42)

# Select the first 50 rows from the shuffled dataset, comment if you want 15k
dataset = dataset_shuffled.select(range(50))

dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
dataset