In [None]:
# Install dependencies
!pip install transformers datasets

# Import necessary libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the pre-trained T5 model and tokenizer
model_name = "t5-small"  # Using a small version for faster training
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load a small sample of the LAMBADA dataset
# We're using 'train[:20]' to get only the first 20 examples for quick testing
dataset = load_dataset("lambada", split="train[:20]")

# Preprocess the dataset
# LAMBADA involves predicting the last word, so we split text into input and target
def preprocess_function(examples):
    """Turn a batch of LAMBADA passages into T5 inputs and last-word labels.

    Args:
        examples: batch mapping with a ``"text"`` list of strings (the shape
            ``dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the passages minus their last word, with a
        ``"labels"`` entry holding the token ids of the last word. Padding
        positions in the labels are set to -100 so the loss ignores them.
    """
    # Split each passage on its final space: everything before is the input,
    # the remainder is the word to predict.
    inputs = [text.rsplit(" ", 1)[0] for text in examples["text"]]
    targets = [text.rsplit(" ", 1)[-1] for text in examples["text"]]

    # NOTE(review): truncation keeps at most 512 input tokens; for longer
    # passages the tokens adjacent to the target word may be cut — confirm
    # this is acceptable for the experiment.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    target_ids = tokenizer(targets, max_length=10, truncation=True, padding="max_length").input_ids

    # Label padding must be -100 (the index the loss ignores); leaving the pad
    # token id in place trains and scores the model on padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in target_ids
    ]
    return model_inputs

# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Set up training arguments with W&B disabled
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,  # Set to 1 epoch for quick testing
    weight_decay=0.01,
    logging_dir="./logs",
    # With 20 examples and a batch size of 8 (single device) the one epoch is
    # only 3 steps, so step 10 is never reached and the training loss column
    # shows "No log".
    logging_steps=10,
    report_to="none"  # Disable Weights & Biases
)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Using the same sample for evaluation in this test
)

# Fine-tune the model
trainer.train()

# Evaluate the model (on the training sample itself, so this is a smoke test
# rather than a held-out score)
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model and its tokenizer side by side so the directory
# can be reloaded with from_pretrained
model.save_pretrained("./fine_tuned_t5_lambada")
tokenizer.save_pretrained("./fine_tuned_t5_lambada")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/20 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,13.617246


Evaluation results: {'eval_loss': 13.6172456741333, 'eval_runtime': 11.5971, 'eval_samples_per_second': 1.725, 'eval_steps_per_second': 0.259, 'epoch': 1.0}


('./fine_tuned_t5_lambada/tokenizer_config.json',
 './fine_tuned_t5_lambada/special_tokens_map.json',
 './fine_tuned_t5_lambada/spiece.model',
 './fine_tuned_t5_lambada/added_tokens.json')

In [None]:
from google.colab import files

# Upload the files (you will get a prompt to select them).
# Colab-only step: the bAbI cell below opens "qa1_train.txt" and "qa1_test.txt"
# from the working directory, so select both files here.
uploaded = files.upload()


Saving qa1_test.txt to qa1_test.txt
Saving qa1_train.txt to qa1_train.txt


In [None]:
# Import required libraries
import nltk
nltk.download('punkt')  # Download the 'punkt' tokenizer data

from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import os
from nltk.tokenize import word_tokenize
from datasets import Dataset
from functools import reduce

# Initialize the T5 model and tokenizer.
# This rebinds `tokenizer` and `model` to freshly loaded 't5-small' objects,
# so this cell does not continue from any model trained earlier in the notebook.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Function to tokenize the sentence
def tokenize(sentence):
    """Split ``sentence`` into a list of word tokens via NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sentence)
    return tokens

# Function to parse the stories in bAbI format
def parse_stories(lines, only_supporting=False):
    """Parse bAbI-formatted lines into ``(substory, question, answer)`` tuples.

    Every non-blank line is ``"<id> <text>"``. An id of 1 starts a new story.
    A line whose text contains tabs is a question of the form
    ``question<TAB>answer<TAB>supporting ids``; any other line is a statement.

    Args:
        lines: iterable of raw text lines.
        only_supporting: when true, each substory holds only the statements
            named by the question's supporting ids; otherwise it holds every
            statement seen so far in the current story.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, where
        ``substory`` is a list of tokenized statements and ``answer`` is the
        raw answer string.

    Raises:
        ValueError: if a non-blank line lacks the ``"<id> <text>"`` shape or a
            question line does not have exactly three tab-separated fields.
    """
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines carry no statement or question; unpacking the split
            # below would otherwise fail on them.
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            if only_supporting:
                # Supporting ids are 1-based line numbers within the story.
                supporting_ids = [int(i) for i in supporting.split()]
                substory = [story[i - 1] for i in supporting_ids]
            else:
                # Drop the '' placeholders left behind by earlier questions.
                substory = [x for x in story if x]
            data.append((substory, q, a))
            # Placeholder keeps later supporting ids aligned with line numbers.
            story.append('')
        else:
            story.append(tokenize(line))
    return data

# Function to load and parse stories
def get_stories(f, only_supporting=False, max_length=None):
    """Read bAbI stories from an open file and flatten each story's sentences.

    Args:
        f: file-like object exposing ``readlines()``.
        only_supporting: forwarded to ``parse_stories``.
        max_length: when truthy, keep only stories whose flattened token
            count is strictly less than this value.

    Returns:
        A list of ``(story_tokens, question, answer)`` tuples, where
        ``story_tokens`` is the story's sentences concatenated into one flat
        list. A story with no sentences yields an empty list.
    """
    parsed = parse_stories(f.readlines(), only_supporting=only_supporting)
    stories = []
    for story, q, answer in parsed:
        # Flatten once per story; a comprehension also handles an empty story,
        # which reduce() without an initial value cannot.
        flat_story = [token for sentence in story for token in sentence]
        if not max_length or len(flat_story) < max_length:
            stories.append((flat_story, q, answer))
    return stories

# T5-specific preprocessing function to format the input as "question: <query> context: <context>"
def preprocess_babi_t5(stories):
    """Format bAbI stories as T5 text-to-text examples and tokenize them.

    Args:
        stories: iterable of ``(story_tokens, query_tokens, answer)`` tuples,
            where the first two are lists of string tokens and ``answer`` is
            a string.

    Returns:
        The tokenizer output for inputs of the form
        ``"question: <query> context: <context>"``, with a ``"labels"`` entry
        holding the answer token ids. Padding positions in the labels are set
        to -100 so the loss ignores them.
    """
    inputs = []
    answers = []
    for story, query, answer in stories:
        # Story and query arrive as token lists; rejoin them into plain text.
        context = ' '.join(story)
        inputs.append(f"question: {' '.join(query)} context: {context}")
        answers.append(answer)  # The answer is directly used as the target

    # Tokenize inputs and labels using T5 tokenizer
    tokenized_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    answer_ids = tokenizer(answers, max_length=32, truncation=True, padding="max_length").input_ids

    # Label padding must be -100 (the index the loss ignores); leaving the pad
    # token id in place makes most of each 32-token label pure padding that
    # the model is trained and scored on.
    pad_id = tokenizer.pad_token_id
    tokenized_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in answer_ids
    ]
    return tokenized_inputs

# Check that files were uploaded correctly
print(os.listdir())  # You should see 'qa1_train.txt' and 'qa1_test.txt'

# Set file paths
train_file_path = "qa1_train.txt"
test_file_path = "qa1_test.txt"

# Parse and load the stories from the train and test files
with open(train_file_path, 'r') as train_file:
    train_stories = get_stories(train_file)

with open(test_file_path, 'r') as test_file:
    test_stories = get_stories(test_file)

train_stories = train_stories[:50]  # Use only the first 50 examples for training
test_stories = test_stories[:50]  # Likewise keep only the first 50 test examples

# Preprocess the stories for T5
tokenized_train = preprocess_babi_t5(train_stories)
tokenized_test = preprocess_babi_t5(test_stories)

# Convert tokenized data to Hugging Face Dataset object for the Trainer
train_dataset = Dataset.from_dict(tokenized_train)
test_dataset = Dataset.from_dict(tokenized_test)

# Define training arguments for T5 fine-tuning
training_args = TrainingArguments(
    output_dir='./results_babi_t5',
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Set the number of epochs
    weight_decay=0.01,
    logging_dir='./logs_babi',
    # 50 examples at batch size 8 (single device) is 7 steps per epoch, so the
    # first training-loss entry only appears during epoch 2.
    logging_steps=10,
    report_to="none",  # Disable W&B logging
)


# Initialize the Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model on the test set
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model and its tokenizer to the same directory
model.save_pretrained("./fine_tuned_t5_babi")
tokenizer.save_pretrained("./fine_tuned_t5_babi")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['.config', 'results_babi_t5', 'logs_babi', 'logs', 'fine_tuned_t5_lambada', 'qa1_test.txt', 'results', 'qa1_train.txt', 'sample_data']


Epoch,Training Loss,Validation Loss
1,No log,13.17113
2,16.769400,6.27549
3,7.236500,4.47553


Evaluation results: {'eval_loss': 4.475529670715332, 'eval_runtime': 31.3665, 'eval_samples_per_second': 1.594, 'eval_steps_per_second': 0.223, 'epoch': 3.0}


('./fine_tuned_t5_babi/tokenizer_config.json',
 './fine_tuned_t5_babi/special_tokens_map.json',
 './fine_tuned_t5_babi/spiece.model',
 './fine_tuned_t5_babi/added_tokens.json')

In [None]:
from datasets import Dataset, load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import os

# Load the ARC-Easy dataset from Hugging Face (all splits; the 'train' and
# 'validation' splits are sampled further down)
dataset = load_dataset("ai2_arc", "ARC-Easy")

# Initialize T5 tokenizer and model.
# Both names are rebound to freshly loaded "t5-small" objects here.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Function to preprocess the ARC dataset for T5, handling both numeric and alphabetical answer keys
def preprocess_arc(examples):
    """Convert ARC multiple-choice rows into T5 text-to-text examples.

    Args:
        examples: mapping (or dataset) with parallel ``"question"``,
            ``"choices"`` and ``"answerKey"`` columns; each ``choices`` entry
            has a ``"text"`` list of answer strings.

    Returns:
        The tokenizer output (unpadded) for inputs of the form
        ``"question: <question> choices: <choice texts>"``, with a
        ``"labels"`` entry holding the token ids of the correct choice text.
        Rows whose answer key cannot be mapped to a choice are skipped with a
        printed warning, so the output may have fewer rows than the input.
    """
    inputs = []
    labels = []

    # Read each column once instead of re-indexing `examples` per row.
    rows = zip(examples["question"], examples["choices"], examples["answerKey"])
    for question, choice_set, answer_key in rows:
        choices = choice_set["text"]

        # Handle numeric answer keys (e.g., '1', '2', '3') and convert to 0-indexed numbers
        if answer_key.isdigit():
            answer_index = int(answer_key) - 1
        # Handle single-letter answer keys (e.g., 'A', 'B', 'C', 'D'); ord()
        # only accepts one character, so longer alphabetic keys are invalid.
        elif len(answer_key) == 1 and answer_key.isalpha():
            answer_index = ord(answer_key.upper()) - ord('A')
        else:
            print(f"Warning: Invalid answer key '{answer_key}' for question '{question}'. Skipping this entry.")
            continue

        # Reject both ends of the range: a key of '0' maps to -1, which would
        # otherwise silently select the last choice.
        if not 0 <= answer_index < len(choices):
            print(f"Warning: answer index out of range for question '{question}' with choices {choices}. Skipping this entry.")
            continue

        # Create the input format for T5
        inputs.append(f"question: {question} choices: {' '.join(choices)}")
        labels.append(choices[answer_index])

    # Tokenize without padding; padding is left to the data collator.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding=False)
    labels = tokenizer(labels, max_length=32, truncation=True, padding=False).input_ids

    model_inputs["labels"] = labels
    return model_inputs

# Limit the dataset to a small sample
train_sample_size = 100  # Select first 100 samples from the training set
validation_sample_size = 50  # Select first 50 samples from the validation set

train_data_sample = dataset['train'].select(range(train_sample_size))
validation_data_sample = dataset['validation'].select(range(validation_sample_size))

# Manually preprocess the train and validation data with the selected small sample.
# preprocess_arc skips rows with an unusable answer key, so the processed sets
# can be smaller than the sample sizes above.
train_processed = preprocess_arc(train_data_sample)
validation_processed = preprocess_arc(validation_data_sample)

# Convert the preprocessed data into a Hugging Face dataset
train_data = Dataset.from_dict(train_processed)
validation_data = Dataset.from_dict(validation_processed)

# Data collator for padding during training (examples were tokenized without
# padding, so each batch is padded to its longest member here)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest", return_tensors="pt")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results_arc_t5",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_arc",
    logging_steps=10,
    report_to="none",  # Disable W&B logging
)

# Initialize Trainer with the data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    data_collator=data_collator,  # Use data collator to pad dynamically
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model and its tokenizer to the same directory
model.save_pretrained("./fine_tuned_t5_arc_easy")
tokenizer.save_pretrained("./fine_tuned_t5_arc_easy")


Epoch,Training Loss,Validation Loss
1,1.525,1.145553
2,1.1461,0.968289
3,0.8884,0.927985


Evaluation results: {'eval_loss': 0.9279851317405701, 'eval_runtime': 6.2056, 'eval_samples_per_second': 8.057, 'eval_steps_per_second': 1.128, 'epoch': 3.0}


('./fine_tuned_t5_arc_easy/tokenizer_config.json',
 './fine_tuned_t5_arc_easy/special_tokens_map.json',
 './fine_tuned_t5_arc_easy/spiece.model',
 './fine_tuned_t5_arc_easy/added_tokens.json')

In [None]:
# Install dependencies
!pip install transformers datasets accelerate

# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the pre-trained LLaMA model and tokenizer.
# "meta-llama/Llama-1b" is not a repository on the Hugging Face Hub (loading it
# raises OSError); the 1B checkpoint is published as "meta-llama/Llama-3.2-1B".
# It is a gated repository: accept the license on the model page and
# authenticate (e.g. `huggingface-cli login` or an HF_TOKEN secret) before running.
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The preprocessing below pads to a fixed length, which requires a pad token.
# When the tokenizer does not define one, reuse EOS and keep the model config in sync.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

# Load a small sample of the LAMBADA dataset
# We're using 'train[:20]' to get only the first 20 examples for quick testing
dataset = load_dataset("lambada", split="train[:20]")

# Preprocess the dataset
# LAMBADA involves predicting the last word, so we split text into input and target
def preprocess_function(examples):
    """Turn a batch of LAMBADA passages into causal-LM last-word examples.

    A causal LM needs ``labels`` with the same length as ``input_ids``, so the
    whole passage is the input and the loss is restricted to the tokens of the
    final word: every context and padding position in the labels is -100.

    Args:
        examples: batch mapping with a ``"text"`` list of strings (the shape
            ``dataset.map(..., batched=True)`` passes in).

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list of equal-length (512) integer lists.
    """
    max_length = 512
    # Padding needs an id even when the tokenizer defines no pad token; fall
    # back to EOS. Padded positions are masked out of attention and loss anyway.
    pad_id = next(
        (token_id for token_id in (tokenizer.pad_token_id, tokenizer.eos_token_id)
         if token_id is not None),
        0,
    )

    texts = list(examples["text"])
    contexts = [text.rsplit(" ", 1)[0] for text in texts]  # All except the last word

    # Tokenize passage and context separately; the surplus tokens of the full
    # passage are the ones that encode the last word.
    # NOTE(review): assumes the context tokenization is a prefix of the full
    # tokenization, which holds when the tokenizer splits on the final space.
    full_ids_batch = tokenizer(texts).input_ids
    context_ids_batch = tokenizer(contexts).input_ids

    input_ids, attention_mask, labels = [], [], []
    for full_ids, context_ids in zip(full_ids_batch, context_ids_batch):
        # Keep the END of long passages so the target word is never truncated away.
        ids = list(full_ids)[-max_length:]
        # At least one target token (covers single-word passages), never more
        # than what survived truncation.
        n_target = min(max(1, len(full_ids) - len(context_ids)), len(ids))
        n_context = len(ids) - n_target
        n_pad = max_length - len(ids)

        input_ids.append(ids + [pad_id] * n_pad)
        attention_mask.append([1] * len(ids) + [0] * n_pad)
        labels.append([-100] * n_context + ids[n_context:] + [-100] * n_pad)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Set up training arguments with W&B disabled
# NOTE(review): "./results" and "./logs" are the same directories the T5
# LAMBADA cell uses; consider distinct paths if both runs' artifacts should be kept.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,  # Adjusted for larger model
    per_device_eval_batch_size=4,
    num_train_epochs=1,  # Set to 1 epoch for quick testing
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Disable Weights & Biases
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Using the same sample for evaluation in this test
)

# Fine-tune the model
trainer.train()

# Evaluate the model (on the training sample itself, so this is a smoke test
# rather than a held-out score)
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the fine-tuned model and its tokenizer to the same directory
model.save_pretrained("./fine_tuned_llama_lambada")
tokenizer.save_pretrained("./fine_tuned_llama_lambada")


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: meta-llama/Llama-1b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`