In [2]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Checkpoint identifier handed to `from_pretrained` for both the tokenizer
# and the model in the cells below.
model_id = "meta-llama/Meta-Llama-3-8B"

In [3]:
# Load the annotated sentences from JSON.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON interchange files are UTF-8).
with open('./data/gpt3/inference_decoded_eng.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [4]:
# Collect one (sentence, completion) pair per entry in 'doping_sentences'.
texts = []
completions = []

for entry in data:
    # `or []` also covers entries whose 'doping_sentences' is present but null,
    # which would otherwise raise a TypeError when iterated.
    for doping_sentence in entry.get('doping_sentences') or []:
        texts.append(doping_sentence.get('sentence_text', ''))
        # Default to None rather than '': with an empty-string default the
        # dropna() below never removes sentences that lack a completion, and
        # they would end up as empty training targets.
        completions.append(doping_sentence.get('llm_completion'))

# Assemble the frame, drop rows without a completion, and renumber the rows so
# the index stays contiguous after the drop.
df = pd.DataFrame({'sentence_text': texts, 'doping': completions})
df = df.dropna(subset=['doping']).reset_index(drop=True)

In [5]:
# Preview the first five rows of the assembled frame.
df.head(n=5)

Unnamed: 0,sentence_text,doping
0,Comparison of chemical bath-deposited ZnO film...,The host 'ZnO' was doped with 'Al'.\nThe host...
1,A comparative study is presented on chemical b...,The host 'ZnO' was doped with 'Al'.\nThe host...
2,The study reveals marked differences in dopant...,There is no doping information.\n
3,The presence of dopant in the solution induces...,There is no doping information.\n
4,"All films are (002)-textured, whereas the latt...",The host 'Zn' was doped.\n


In [6]:
# Fetch the tokenizer that belongs to the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# When the checkpoint ships without a padding token, reuse the
# end-of-sequence token (and its id) for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Report which token ended up being used for padding.
print(
    f"Padding Token: {tokenizer.pad_token}",
    f"Padding Token ID: {tokenizer.pad_token_id}",
    sep="\n",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Padding Token: <|end_of_text|>
Padding Token ID: 128001


In [7]:
def preprocess_function(examples, max_prompt_length=128, max_completion_length=32):
    """Turn a batch of (sentence, completion) pairs into causal-LM training rows.

    A causal language model scores every position of ``input_ids`` against the
    label at the same position, so ``input_ids`` and ``labels`` must have the
    same length. Each row is therefore built as ``prompt + completion + EOS``,
    right-padded to ``max_prompt_length + max_completion_length`` tokens. The
    labels copy that sequence, with the prompt and padding positions set to
    -100 so that only the completion (including its EOS) contributes to the
    loss.

    Args:
        examples: Batched mapping with the columns ``'sentence_text'`` (prompts)
            and ``'doping'`` (target completions), each a list of strings.
        max_prompt_length: Token budget for the prompt, including any special
            tokens the tokenizer prepends.
        max_completion_length: Token budget for the completion, including the
            EOS token appended here.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``, each a
        list of equal-length lists of ints, one per input example.

    Raises:
        ValueError: If ``max_completion_length`` leaves no room for at least one
            completion token plus EOS.
    """
    if max_completion_length < 2:
        raise ValueError("max_completion_length must be at least 2")

    # Tokenize without padding or tensor conversion: rows are assembled by hand
    # below, and `Dataset.map` stores plain Python lists.
    prompt_batch = tokenizer(
        examples['sentence_text'],
        max_length=max_prompt_length,
        truncation=True,
    )['input_ids']

    # No special tokens here, so the completion does not get a second BOS in the
    # middle of the sequence; one slot is reserved for the EOS appended below.
    completion_batch = tokenizer(
        examples['doping'],
        max_length=max_completion_length - 1,
        truncation=True,
        add_special_tokens=False,
    )['input_ids']

    total_length = max_prompt_length + max_completion_length
    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    input_ids, attention_mask, labels = [], [], []
    for prompt_ids, completion_ids in zip(prompt_batch, completion_batch):
        # The explicit EOS teaches the model where the answer ends.
        target_ids = list(completion_ids) + [eos_id]
        sequence = list(prompt_ids) + target_ids
        n_pad = total_length - len(sequence)

        input_ids.append(sequence + [pad_id] * n_pad)
        attention_mask.append([1] * len(sequence) + [0] * n_pad)
        # Masking is done by position rather than by comparing against the pad
        # id, because the pad token may be the same as EOS and the real EOS
        # target must stay visible to the loss.
        labels.append([-100] * len(prompt_ids) + target_ids + [-100] * n_pad)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }

from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset so it can be mapped in batches.
dataset = Dataset.from_pandas(df=df)

# Run the tokenization step over the whole dataset, one batch at a time.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [12]:
# Hold out a slice of the data for evaluation. Evaluating on the rows the model
# is trained on reports an over-optimistic loss; a fixed seed keeps the split
# reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
)

# Initialize the model
model = AutoModelForCausalLM.from_pretrained(model_id)

# Initialize the Trainer with disjoint train / evaluation splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
)

# Start training
trainer.train()

: 

In [2]:
# Persist the model weights and the tokenizer files side by side so both can
# be reloaded from the same directory.
model.save_pretrained(save_directory='./trained_model')
tokenizer.save_pretrained(save_directory='./trained_model')

In [3]:
# Reload the saved artifacts. The checkpoint was trained and saved through
# AutoModelForCausalLM, so it has to be reloaded with the same auto-class;
# AutoModelForSeq2SeqLM is neither imported in this notebook nor a match for a
# decoder-only model.
model = AutoModelForCausalLM.from_pretrained('./trained_model')
tokenizer = AutoTokenizer.from_pretrained('./trained_model')

# Example inference
inputs = tokenizer("Example sentence.", return_tensors="pt")
outputs = model.generate(
    **inputs,
    # Explicit budget for the generated continuation, independent of the
    # prompt length.
    max_new_tokens=32,
    # Pass the pad id explicitly so generate() does not have to guess one.
    pad_token_id=tokenizer.pad_token_id,
)
# For a causal LM the returned sequence starts with the prompt tokens, so the
# decoded text contains the prompt followed by the continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))