In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import pandas as pd
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, T5ForConditionalGeneration, T5Tokenizer
from datasets import Dataset
import warnings
warnings.filterwarnings("ignore")
from peft import LoraConfig, get_peft_model

# Initialize the causal model and tokenizer
model_name = "google-t5/t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)


X_train = pd.read_csv("/content/drive/MyDrive/summarizer/assets/data/X_train.csv")
y_train = pd.read_csv("/content/drive/MyDrive/summarizer/assets/data/y_train.csv")
X_test = pd.read_csv("/content/drive/MyDrive/summarizer/assets/data/X_test.csv")
y_test = pd.read_csv("/content/drive/MyDrive/summarizer/assets/data/y_test.csv")
X_eval = pd.read_csv("/content/drive/MyDrive/summarizer/assets/data/X_eval.csv")
y_eval = pd.read_csv("/content/drive/MyDrive/summarizer/assets/data/y_eval.csv")

train_data = pd.DataFrame({'review': X_train['X'], 'summary': y_train['y']})
test_data = pd.DataFrame({'review': X_test['X'], 'summary': y_test['y']})
eval_data = pd.DataFrame({'review': X_eval['X'], 'summary': y_eval['y']})

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
eval_dataset = Dataset.from_pandas(eval_data)

def create_tokens(examples):
    inputs = examples['review']
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset = train_dataset.map(create_tokens, batched=True)
test_dataset = test_dataset.map(create_tokens, batched=True)
eval_dataset = eval_dataset.map(create_tokens, batched=True)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [15]:
lora_config = LoraConfig(
    r=8,                  # LoRA rank
    lora_alpha=32,        # LoRA alpha
    lora_dropout=0.1,     # Dropout for LoRA layers
    target_modules=["q", "v"],  # Modules to apply LoRA to
    task_type="SEQ_2_SEQ_LM"    # Task type for the LoRA adapter
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

model.to("cuda")

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/summarizer/summarizer-results",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=12,
    num_train_epochs=50,
    weight_decay=0.0,
    logging_dir='/content/drive/MyDrive/summarizer/summarizer-logs',
    save_total_limit=2,
    save_strategy="epoch",
    gradient_accumulation_steps=2,
    max_grad_norm=1,
    fp16=True,
    report_to="none",
    load_best_model_at_end=True
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping_callback]
)

trainer.train()

print(trainer.evaluate(test_dataset))

save_directory = "/content/drive/MyDrive/summarizer/summarizer-results"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Epoch,Training Loss,Validation Loss
1,No log,7.579954
2,No log,5.740186
3,No log,2.769222
4,No log,1.74232
5,No log,1.700739
6,No log,1.654051
7,No log,1.593879
8,No log,1.555876
9,No log,1.537042
10,No log,1.524965


{'eval_loss': 1.4279675483703613, 'eval_runtime': 1.0126, 'eval_samples_per_second': 35.552, 'eval_steps_per_second': 2.963, 'epoch': 22.0}


('/content/drive/MyDrive/summarizer/summarizer-results/tokenizer_config.json',
 '/content/drive/MyDrive/summarizer/summarizer-results/special_tokens_map.json',
 '/content/drive/MyDrive/summarizer/summarizer-results/spiece.model',
 '/content/drive/MyDrive/summarizer/summarizer-results/added_tokens.json')

In [82]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Check if GPU is available and move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Summarization function
def summarize(text):
    # Tokenize input and move to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(
        input_ids=inputs['input_ids'],
        max_length=128,
        min_length=24,
        length_penalty=2.0,
        repetition_penalty=2.0,
        num_beams=2,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage
text = test_dataset[20]['review']
summary = summarize(text)
print(summary)


This product is a bust. It does not break open, but it does work for me. I would consider buying this for that purpose.


In [84]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=1c28706a42223318732d96e797687c00e7e28a8206b8460bd1aa9537fa8c958c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [85]:
from rouge_score import rouge_scorer
from tqdm import tqdm

# ROUGE scorer setup
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Lists to store scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Evaluate on test data
for sample in tqdm(test_dataset):
    review = sample['review']
    reference_summary = sample['summary']

    # Generate summary
    inputs = tokenizer(review, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(
        input_ids=inputs['input_ids'],
        max_length=128,
        min_length=24,
        length_penalty=2.0,
        repetition_penalty=2.0,
        num_beams=2,
        early_stopping=True
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores
    scores = scorer.score(reference_summary, generated_summary)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Calculate and display average scores
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)

print(f"Average ROUGE-1 F1 score: {avg_rouge1:.4f}")
print(f"Average ROUGE-2 F1 score: {avg_rouge2:.4f}")
print(f"Average ROUGE-L F1 score: {avg_rougeL:.4f}")


100%|██████████| 36/36 [00:53<00:00,  1.48s/it]

Average ROUGE-1 F1 score: 0.3572
Average ROUGE-2 F1 score: 0.0841
Average ROUGE-L F1 score: 0.2146



