In [1]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
import torch

In [2]:
pip install tf-keras

Note: you may need to restart the kernel to use updated packages.




In [3]:
from datasets import Dataset

In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments




In [5]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
import pandas as pd

In [7]:
train_data=pd.read_csv('samsum-train.csv')
validation_data=pd.read_csv('samsum-validation.csv')

In [8]:
train_data=train_data.sample(n=500,random_state=42).reset_index(drop=True)
validation_data=validation_data.sample(n=100,random_state=42).reset_index(drop=True)

In [9]:
import re
def clean_text(text):
    text=re.sub(r'\r\n',' ',text)
    text=re.sub(r'\s+',' ',text)
    text=re.sub(r'<.*?>','',text)
    text=text.strip().lower()
    return text

In [10]:
train_data['dialogue']=train_data['dialogue'].apply(clean_text)
train_data['summary']=train_data['summary'].apply(clean_text)
validation_data['dialogue']=validation_data['dialogue'].apply(clean_text)
validation_data['summary']=validation_data['summary'].apply(clean_text)

In [11]:
def prepocessing_function(examples):
  inputs=tokenizer(examples['dialogue'],max_length=256,padding='max_length',truncation=True)
  targets=tokenizer(examples['summary'],max_length=64,padding='max_length',truncation=True)
  inputs['labels']=targets['input_ids']
  return inputs

train_dataset=train_data.apply(prepocessing_function,axis=1)
val_dataset=validation_data.apply(prepocessing_function,axis=1)

In [12]:
train_dataset = Dataset.from_pandas(train_data).map(prepocessing_function, batched=True)
val_dataset = Dataset.from_pandas(validation_data).map(prepocessing_function, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
model=T5ForConditionalGeneration.from_pretrained('t5-small')

In [14]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=0,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=0,
    evaluation_strategy="no",
    logging_dir='./logs',
    disable_tqdm=False
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)



In [15]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,9.5637
20,6.65
30,3.9521
40,2.9408
50,2.3542
60,2.2069


TrainOutput(global_step=63, training_loss=4.500904643346393, metrics={'train_runtime': 255.8856, 'train_samples_per_second': 1.954, 'train_steps_per_second': 0.246, 'total_flos': 33835450368000.0, 'train_loss': 4.500904643346393, 'epoch': 1.0})

In [16]:
model.save_pretrained("./saved_summary_model")
tokenizer.save_pretrained("./saved_summary_model")

('./saved_summary_model\\tokenizer_config.json',
 './saved_summary_model\\special_tokens_map.json',
 './saved_summary_model\\spiece.model',
 './saved_summary_model\\added_tokens.json')

In [17]:
model = T5ForConditionalGeneration.from_pretrained("./saved_summary_model")
tokenizer = T5Tokenizer.from_pretrained("./saved_summary_model")

In [18]:
device = torch.device('cpu')

In [21]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [22]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [23]:
def summarize_dialogue(dialogue):
    cleaned = clean_text(dialogue)
    input_text = "summarize: " + cleaned  # Important for T5
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=64,
            num_beams=4,
            early_stopping=True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [24]:
sample_dialogue = """
Violet: Hey Claire! I was reading an article about Austin and thought you might find it interesting! 
Violet: It's about the current trends in urban development and how cities are planning for the future.
Violet: Here, let me share the link: <file_other>
Claire: Oh wow, that sounds like an insightful read. But I've actually already read that one last week. 
Claire: It was really interesting though, especially the part about sustainable architecture in cities. 
Claire: You know, I've been following these urban planning discussions for a while now.
Violet: Oh, I didn’t know that! Well, I’ll look for something else then, maybe something about eco-friendly cities or tech innovations.
Claire: That would be awesome! Let me know if you find something cool.
Violet: Sure, I’ll keep you posted. Thanks for the feedback!
"""

# Generate summary
summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)

Summary: 


In [25]:
train_data.shape

(500, 3)