In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 \
    pandas --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

[0m

In [3]:
import pandas as pd
from datasets import Dataset  # Import the Dataset class
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
import torch
import time

from datasets import Dataset

In [4]:

# Function to add instructions to the dataset
def add_instructions(example):
    instruction = "Please summarize the following dialogue:"
    # Prepend the instruction to the 'dialogue' text
    example['dialogue'] = f"{instruction} {example['dialogue']}"
    return example

# Load your dataset from the Excel file
excel_path = 'NIAA RGS _ GIM changes_Edited.xlsx'
df = pd.read_excel(excel_path)

# Assuming your DataFrame columns are 'dialogue' and 'summary'
# Let's first drop the rows where 'dialogue' or 'summary' column is NaN
df = df.dropna(subset=['dialogue', 'summary '])

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Add instructions to each example in the dataset
dataset_with_instructions = dataset.map(add_instructions)

# The tokenizer you're using
model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_and_encode(examples):
    # Tokenize the inputs and labels
    tokenized_inputs = tokenizer(examples['dialogue'], padding='max_length', truncation=True, max_length=512)
    tokenized_labels = tokenizer(examples['summary '], padding='max_length', truncation=True, max_length=512)

    # Hugging Face expects the labels to be named 'labels', not 'input_ids'
    tokenized_labels["labels"] = tokenized_labels["input_ids"]
    # We don't need to compute loss for padding tokens
    tokenized_labels["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels] for labels in tokenized_labels["labels"]
    ]

    # Return the tokenized inputs and labels
    return {"input_ids": tokenized_inputs["input_ids"], "attention_mask": tokenized_inputs["attention_mask"], "labels": tokenized_labels["labels"]}

# Apply the tokenization and encoding function to the dataset
tokenized_dataset = dataset_with_instructions.map(tokenize_and_encode, batched=True)

# Now, tokenized_dataset is ready to be used for training


Map:   0%|          | 0/27 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/27 [00:00<?, ? examples/s]

In [5]:
# Check if a GPU is available and set it as the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
import torch
from transformers import AutoModelForSeq2SeqLM, Trainer, TrainingArguments, TrainerCallback

# Load the model using torch.no_grad() to save memory
with torch.no_grad():
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define a callback to print the loss
class PrintLossCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.is_local_process_zero:
            if logs is not None and 'loss' in logs:
                print(f"Step: {state.global_step}, Loss: {logs['loss']}")


# Move the model to GPU after it's loaded to avoid doubling the memory usage
model = model.to(device)

# Setup training arguments
training_args = TrainingArguments(
    output_dir='./model_output',
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Further reduce the batch size
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,  # Use mixed precision training
    save_strategy="no",  # Disable model checkpointing
)

# Initialize Trainer with the callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Your tokenized dataset variable
    callbacks=[PrintLossCallback]
)

# Train the model
trainer.train()



Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0


Step: 10, Loss: 0.0
Step: 20, Loss: 0.0
Step: 30, Loss: 0.0
Step: 40, Loss: 0.0
Step: 50, Loss: 0.0
Step: 60, Loss: 0.0
Step: 70, Loss: 0.0
Step: 80, Loss: 0.0


TrainOutput(global_step=81, training_loss=0.0, metrics={'train_runtime': 14.8556, 'train_samples_per_second': 5.452, 'train_steps_per_second': 5.452, 'total_flos': 55465345548288.0, 'train_loss': 0.0, 'epoch': 3.0})