In [1]:
from datasets import load_dataset
from datasets import DatasetDict

dataset = load_dataset("HamdanXI/beethoven_qa")

# Split the dataset into training and testing sets (90% train, 10% test)
train_test_split = dataset["train"].train_test_split(test_size=0.1)

# Create a DatasetDict to keep the splits organized
dataset_split = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print(f"Training set size: {len(dataset_split['train'])}")
print(f"Testing set size: {len(dataset_split['test'])}")

# Preprocess the dataset
def preprocess_function(examples):
    inputs = ["question: " + q for q in examples["question"]]
    targets = ["answer: " + a for a in examples["answer"]]
    model_inputs = {"input_ids": inputs, "labels": targets}
    return model_inputs

tokenized_dataset = dataset_split.map(preprocess_function, batched=True)

  from .autonotebook import tqdm as notebook_tqdm


Training set size: 1215
Testing set size: 136


Map: 100%|██████████| 1215/1215 [00:00<00:00, 45045.43 examples/s]
Map: 100%|██████████| 136/136 [00:00<00:00, 9811.41 examples/s]


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [3]:
'''
def get_max_length(batch):
    max_length_input = max(len(tokenizer.encode(q)) for q in batch['question'])
    max_length_output = max(len(tokenizer.encode(a)) for a in batch['answer'])
    max_length = max(max_length_input, max_length_output)
    return max_length

max_pad = get_max_length(dataset['train'])

if max_pad > 512:
    max_pad = 512
    
print(max_pad)
'''

"\ndef get_max_length(batch):\n    max_length_input = max(len(tokenizer.encode(q)) for q in batch['question'])\n    max_length_output = max(len(tokenizer.encode(a)) for a in batch['answer'])\n    max_length = max(max_length_input, max_length_output)\n    return max_length\n\nmax_pad = get_max_length(dataset['train'])\n\nif max_pad > 512:\n    max_pad = 512\n    \nprint(max_pad)\n"

In [4]:
# Tokenize the inputs and labels
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_ids"], padding="max_length", truncation=True)
    labels = tokenizer(examples["labels"], padding="max_length", truncation=True).input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = tokenized_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 1215/1215 [00:01<00:00, 649.02 examples/s]
Map: 100%|██████████| 136/136 [00:00<00:00, 604.59 examples/s]


In [5]:
import torch

if torch.cuda.is_available():
    print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
    model = model.to("cuda")
else:
    print("CUDA is not available. Check your installation.")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

CUDA is available. Device: NVIDIA GeForce RTX 2070


  attn_output = torch.nn.functional.scaled_dot_product_attention(


OutOfMemoryError: CUDA out of memory. Tried to allocate 176.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 21.78 GiB is allocated by PyTorch, and 46.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.save_model("TinyLlama-finetuned-beethoven-qa")
tokenizer.save_pretrained("TinyLlama-finetuned-beethoven-qa")