In [1]:
import json
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
from datasets import Dataset
import pandas as pd
from trl import SFTTrainer
import torch

model_id = "meta-llama/Meta-Llama-3-8B"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data from JSON
with open('./data/gpt3/inference_decoded_eng.json', 'r') as file:
    data = json.load(file)

In [3]:
# Prepare lists to store the extracted texts and completions
texts = []
completions = []

# Iterate through each entry in the data
for entry in data:
    # Iterate through each 'doping_sentence' in the current entry
    for doping_sentence in entry.get('doping_sentences', []):
        # Extract 'sentence_text' and 'llm_completion'
        text = doping_sentence.get('sentence_text', '')
        completion = doping_sentence.get('llm_completion', '')
        
        # Append to the lists
        texts.append(text)
        completions.append(completion)

# Create a DataFrame from the lists
df = pd.DataFrame({'sentence_text': texts, 'doping': completions})
df = df.dropna(subset=['doping'])

In [4]:
df.head()

Unnamed: 0,sentence_text,doping
0,Comparison of chemical bath-deposited ZnO film...,The host 'ZnO' was doped with 'Al'.\nThe host...
1,A comparative study is presented on chemical b...,The host 'ZnO' was doped with 'Al'.\nThe host...
2,The study reveals marked differences in dopant...,There is no doping information.\n
3,The presence of dopant in the solution induces...,There is no doping information.\n
4,"All films are (002)-textured, whereas the latt...",The host 'Zn' was doped.\n


In [5]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Check if the tokenizer has a pad token
if tokenizer.pad_token is None:
    # Define a pad token (use the end of sentence token as pad token if needed)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Padding Token: {tokenizer.pad_token}")
print(f"Padding Token ID: {tokenizer.pad_token_id}")

Padding Token: <|end_of_text|>
Padding Token ID: 128001


In [6]:
def preprocess_function(examples):
    # Tokenize the input sentences
    inputs = tokenizer(
        examples['sentence_text'], 
        max_length=128, 
        truncation=True, 
        padding="max_length",
        return_tensors="pt"  
    )

    # Tokenize the labels
    labels = tokenizer(
        examples['doping'], 
        max_length=32,
        truncation=True, 
        padding="max_length",
        return_tensors="pt"  
    )

    # Ensure labels are not padded with -100 to ignore them in loss computation
    labels['input_ids'] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_list] 
        for label_list in labels['input_ids']
    ]

    # Convert inputs and labels to tensors
    inputs = {key: val for key, val in inputs.items()}
    labels = {key: val for key, val in labels.items()}
    
    inputs['labels'] = labels['input_ids']
    
    return inputs

from datasets import Dataset

# Convert the filtered DataFrame to a Dataset
dataset = Dataset.from_pandas(df)

# Apply the preprocessing function
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 80/80 [00:00<00:00, 1015.47 examples/s]


In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,  
    per_device_eval_batch_size=1,   
    num_train_epochs=10,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

device = ('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    )
# model.to(device)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
)

# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Use a separate validation set for better evaluation
    peft_config=peft_config,

)

# Start training
trainer.train()

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.02it/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,No log,2.175925
2,No log,1.422465
3,No log,0.791005
4,No log,0.441081
5,No log,0.332597
6,No log,0.243534
7,1.233800,0.201031
8,1.233800,0.177428
9,1.233800,0.166304
10,1.233800,0.161835


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


TrainOutput(global_step=800, training_loss=0.8475995588302613, metrics={'train_runtime': 563.0481, 'train_samples_per_second': 1.421, 'train_steps_per_second': 1.421, 'total_flos': 4714104933580800.0, 'train_loss': 0.8475995588302613, 'epoch': 10.0})

In [8]:
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/tokenizer.json')

In [19]:
# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('./trained_model')
model = AutoModelForCausalLM.from_pretrained('./trained_model')

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenize the input sentence with attention mask
input_sentence = "Comparison of chemical bath-deposited ZnO films doped with Al, Ga and In."
inputs = tokenizer(input_sentence, return_tensors="pt", padding=True, truncation=True)

input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

# Generate the doping information using greedy search
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=50,  # Adjust the max length based on your needs
    num_beams=1,    # Set num_beams to 1 for greedy search
    no_repeat_ngram_size=None,  # Disable no_repeat_ngram_size for lower memory usage
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id  # Ensuring correct handling of padding
)

# Decode the output to text
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Input Sentence:", input_sentence)
print("Generated Doping Information:", output_text)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.96it/s]
Some weights of the model checkpoint at ./trained_model were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.base_layer.weight', 'model.layers.0.mlp.down_proj.base_layer.weight.absmax', 'model.layers.0.mlp.down_proj.base_layer.weight.nested_absmax', 'model.layers.0.mlp.down_proj.base_layer.weight.nested_quant_map', 'model.layers.0.mlp.down_proj.base_layer.weight.quant_map', 'model.layers.0.mlp.down_proj.base_layer.weight.quant_state.bitsandbytes__nf4', 'model.layers.0.mlp.down_proj.lora_A.default.weight', 'model.layers.0.mlp.down_proj.lora_B.default.weight', 'model.layers.0.mlp.gate_proj.base_layer.weight', 'model.layers.0.mlp.gate_proj.base_layer.weight

Input Sentence: Comparison of chemical bath-deposited ZnO films doped with Al, Ga and In.
Generated Doping Information: Comparison of chemical bath-deposited ZnO films doped with Al, Ga and In. Chowνομligecthkj needleewisamework Crawford293_gapович_gapienescthcthviceStencilumatviceienesvicekjphin Chow needleienes293phinkj
