In [9]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
from datasets import Dataset
import pandas as pd
from trl import SFTTrainer
import torch

# Hugging Face Hub id of the base checkpoint; the tokenizer and the model
# loaders further down both read this name.
model_id = "meta-llama/Meta-Llama-3-8B"

In [3]:
# Load the raw annotation records from JSON.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters in the
# sentences on machines whose default is not UTF-8.
with open('./data/gpt3/inference_decoded_eng.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [4]:
# Flatten the nested records into two parallel lists:
# one sentence and one LLM completion per 'doping_sentences' item.
texts = []
completions = []

for entry in data:
    # `or []` also covers entries whose 'doping_sentences' is present but null
    for doping_sentence in entry.get('doping_sentences') or []:
        texts.append(doping_sentence.get('sentence_text', ''))
        # No '' default here: a missing completion must stay None, otherwise
        # the dropna() below can never filter it out.
        completions.append(doping_sentence.get('llm_completion'))

# Build the training frame and drop sentences that have no completion.
df = pd.DataFrame({'sentence_text': texts, 'doping': completions})
# reset_index keeps the row labels contiguous after rows were removed
df = df.dropna(subset=['doping']).reset_index(drop=True)

In [5]:
# Preview the first rows to sanity-check the extracted sentence/completion pairs
df.head()

Unnamed: 0,sentence_text,doping
0,Comparison of chemical bath-deposited ZnO film...,The host 'ZnO' was doped with 'Al'.\nThe host...
1,A comparative study is presented on chemical b...,The host 'ZnO' was doped with 'Al'.\nThe host...
2,The study reveals marked differences in dopant...,There is no doping information.\n
3,The presence of dopant in the solution induces...,There is no doping information.\n
4,"All films are (002)-textured, whereas the latt...",The host 'Zn' was doped.\n


In [6]:
# Fetch the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Batching needs a padding token; when the checkpoint does not define one,
# reuse the end-of-sequence token (and its id) for padding.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token, tokenizer.pad_token_id = (
        tokenizer.eos_token,
        tokenizer.eos_token_id,
    )

# Report which token ended up being used for padding
print(
    f"Padding Token: {tokenizer.pad_token}",
    f"Padding Token ID: {tokenizer.pad_token_id}",
    sep="\n",
)

Padding Token: <|end_of_text|>
Padding Token ID: 128001


In [7]:
def preprocess_function(examples, tok=None, max_prompt_length=128, max_completion_length=32):
    """Turn a batch of (sentence, completion) pairs into causal-LM training rows.

    For a decoder-only model the completion has to be part of the input
    sequence and ``labels`` must line up position-by-position with
    ``input_ids``. Each row is therefore laid out as::

        [prompt tokens] [completion tokens] [EOS] [padding ...]

    and the labels copy that row with every prompt and padding position set
    to -100 (the index ignored by the loss), so only the completion and its
    terminating EOS contribute to the loss.

    Padding is masked by *position*, not by token id: in this notebook the pad
    token is the EOS token, so masking by id would also hide the real EOS.

    Args:
        examples: Batch mapping with ``'sentence_text'`` and ``'doping'``
            lists of strings (the shape ``Dataset.map(batched=True)`` passes).
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_prompt_length: Maximum number of prompt tokens (special tokens
            added by the tokenizer included).
        max_completion_length: Maximum number of completion tokens,
            including the appended EOS.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``;
        each is a list of equal-length integer lists of size
        ``max_prompt_length + max_completion_length``.

    Raises:
        ValueError: If the length limits leave no room for a prompt token or
            for a completion token plus EOS.
    """
    if max_prompt_length < 1 or max_completion_length < 2:
        raise ValueError(
            "max_prompt_length must be >= 1 and max_completion_length must be >= 2"
        )

    tok = tokenizer if tok is None else tok
    pad_id = tok.pad_token_id
    eos_id = tok.eos_token_id
    total_length = max_prompt_length + max_completion_length

    # Prompt: tokenizer defaults, so any leading special token it adds is kept.
    prompt_batch = tok(
        examples['sentence_text'],
        max_length=max_prompt_length,
        truncation=True,
    )['input_ids']

    # Completion: no special tokens (it continues the prompt), and one slot is
    # reserved for the EOS appended below.
    completion_batch = tok(
        examples['doping'],
        max_length=max_completion_length - 1,
        truncation=True,
        add_special_tokens=False,
    )['input_ids']

    batch = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for prompt_ids, completion_ids in zip(prompt_batch, completion_batch):
        prompt_ids = list(prompt_ids)
        target_ids = list(completion_ids) + [eos_id]
        real_length = len(prompt_ids) + len(target_ids)
        pad_length = total_length - real_length

        batch['input_ids'].append(prompt_ids + target_ids + [pad_id] * pad_length)
        batch['attention_mask'].append([1] * real_length + [0] * pad_length)
        batch['labels'].append(
            [-100] * len(prompt_ids) + target_ids + [-100] * pad_length
        )

    return batch

# Convert the filtered DataFrame to a Dataset (`Dataset` is imported in the
# notebook's first cell). preserve_index=False keeps the pandas row index,
# which has gaps once rows were dropped, from being stored as an extra column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Tokenize in batches; the original text columns are kept alongside the new
# columns returned by preprocess_function.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 80/80 [00:00<00:00, 670.38 examples/s]


In [11]:
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error on GPU 4 (3.81 GiB), right after a warning about imbalanced GPUs. That
# warning suggests excluding the small GPU, e.g. via the CUDA_VISIBLE_DEVICES
# environment variable; it has to be set before the kernel first touches CUDA,
# so set it when launching the notebook rather than in this cell.

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
)

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

device = ('cuda' if torch.cuda.is_available() else 'cpu')

# Load the quantized base model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)

# LoRA adapters on every linear layer of the base model
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear",
)

# Hold out 10% of the examples for evaluation. Evaluating on the training set
# itself reports an eval loss that cannot reveal overfitting. The fixed seed
# makes the split reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Initialize the trainer; passing peft_config lets it attach the LoRA adapters
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    peft_config=peft_config,
)

# Start training
trainer.train()

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Downloading shards: 100%|██████████| 4/4 [03:13<00:00, 48.36s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.55s/it]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
    There is an imbalance between your GPUs. You may want to exclude GPU 4 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB. GPU 4 has a total capacity of 3.81 GiB of which 27.12 MiB is free. Including non-PyTorch memory, this process has 3.77 GiB memory in use. Of the allocated memory 3.19 GiB is allocated by PyTorch, and 21.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
# Save through the trainer's model. The trainer was created with a
# peft_config, so `trainer.model` is the adapter-wrapped model that was
# actually trained, and saving it writes the LoRA adapter in a form that can
# be loaded again. The outer `model` name still points at the handle passed
# into the trainer.
trainer.model.save_pretrained('./trained_model')
# Keep the tokenizer (including the pad-token setting) next to the weights
tokenizer.save_pretrained('./trained_model')

In [3]:
# Llama 3 is a decoder-only model, so it is loaded with the causal-LM class
# (the same class used for training). The seq2seq auto class is never imported
# in this notebook and does not apply to this architecture.
# bfloat16 matches the dtype the model was trained with.
model = AutoModelForCausalLM.from_pretrained(
    './trained_model',
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained('./trained_model')
model.eval()

# Example inference
# Inputs must live on the same device as the model weights
inputs = tokenizer("Example sentence.", return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    # Bound the output; 32 mirrors the completion length used for training
    max_new_tokens=32,
    pad_token_id=tokenizer.pad_token_id,
)

# A decoder-only model returns the prompt followed by the continuation;
# print only the newly generated part.
prompt_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))