In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\tasnu\AppData\Roaming\Python\Python312\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

## Data preprocessing

In [None]:
import os

# Location of the corpus CSV. Set the GIGADATA_CSV environment variable to point
# at the file on another machine; when it is unset, the author's local path
# below is used as the default.
corpus_csv_path = os.environ.get(
    "GIGADATA_CSV",
    'C:\\Users\\tasnu\\OneDrive\\Documents\\EssenceAI\\summarizer_model\\gigadata_corpus.csv',
)
df = pd.read_csv(corpus_csv_path)

# Quick look at the first rows to confirm the file loaded as expected.
df.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
def convert_to_t5_format(df, input_column, target_column):
    """Build parallel source/target lists for T5 summarization.

    Each value in ``df[input_column]`` is prefixed with ``"summarize: "``;
    the values in ``df[target_column]`` are passed through unchanged.

    Args:
        df: DataFrame holding both columns.
        input_column: Name of the column with the source texts.
        target_column: Name of the column with the reference summaries.

    Returns:
        A ``(inputs, targets)`` tuple of Python lists in the row order of ``df``.
    """
    task_prefix = "summarize: "
    prefixed_sources = task_prefix + df[input_column]
    reference_summaries = df[target_column]
    return prefixed_sources.tolist(), reference_summaries.tolist()

# Produce prefixed inputs and reference summaries for each split.
# NOTE(review): the target column is named 'summarize' here — confirm this
# matches the CSV header (a KeyError is raised if the column does not exist).
train_inputs, train_targets = convert_to_t5_format(
    train_df, input_column='article', target_column='summarize'
)
test_inputs, test_targets = convert_to_t5_format(
    test_df, input_column='article', target_column='summarize'
)

## Data and model load

In [None]:
# The tokenizer and the seq2seq model are loaded from the same pretrained
# checkpoint so their vocabularies match.
checkpoint_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

def tokenize_data(inputs, targets, tokenizer, max_length=512, max_target_length=150):
    """Encode source texts and target summaries with the given tokenizer.

    Both calls request padding, truncation and PyTorch tensors
    (``padding=True, truncation=True, return_tensors="pt"``).

    Args:
        inputs: Source texts to encode.
        targets: Target summaries to encode.
        tokenizer: Callable tokenizer; called once for ``inputs`` and once for
            ``targets``.
        max_length: Truncation length passed for the source texts.
        max_target_length: Truncation length passed for the target summaries.
            Defaults to 150.

    Returns:
        A ``(input_encodings, target_encodings)`` tuple holding whatever the
        tokenizer returned for each call.
    """
    # Options shared by both calls; only the truncation length differs.
    shared_options = dict(padding=True, truncation=True, return_tensors="pt")
    input_encodings = tokenizer(inputs, max_length=max_length, **shared_options)
    target_encodings = tokenizer(targets, max_length=max_target_length, **shared_options)
    return input_encodings, target_encodings

# Encode each split; default truncation lengths of tokenize_data are used.
train_encodings, train_target_encodings = tokenize_data(
    inputs=train_inputs, targets=train_targets, tokenizer=tokenizer
)
test_encodings, test_target_encodings = tokenize_data(
    inputs=test_inputs, targets=test_targets, tokenizer=tokenizer
)

In [None]:
class TextSummarizationDataset(Dataset):
    """Dataset pairing encoded source texts with encoded target summaries.

    Both constructor arguments are indexed by key and then by example index:
    ``input_encodings`` must provide ``'input_ids'`` and ``'attention_mask'``,
    ``target_encodings`` must provide ``'input_ids'``.
    """

    def __init__(self, input_encodings, target_encodings):
        """Store the two encodings; nothing is copied or transformed."""
        self.input_encodings, self.target_encodings = input_encodings, target_encodings

    def __len__(self):
        """Return the number of examples, taken from the source ``input_ids``."""
        all_input_ids = self.input_encodings['input_ids']
        return len(all_input_ids)

    def __getitem__(self, idx):
        """Return one example as a dict with input ids, attention mask and labels.

        The labels are the target ``input_ids`` at the same index.
        """
        source = self.input_encodings
        return {
            'input_ids': source['input_ids'][idx],
            'attention_mask': source['attention_mask'][idx],
            'labels': self.target_encodings['input_ids'][idx],
        }

# Wrap the encoded splits so they can be served example by example.
train_dataset = TextSummarizationDataset(
    input_encodings=train_encodings,
    target_encodings=train_target_encodings,
)
test_dataset = TextSummarizationDataset(
    input_encodings=test_encodings,
    target_encodings=test_target_encodings,
)

# Batches of 8; only the training loader shuffles its examples.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)

## Train model

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# and absent from recent releases. weight_decay=0.0 keeps the default of the
# transformers implementation (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()
epoch_num = 5
# Padding positions in the labels must not contribute to the loss.
pad_token_id = model.config.pad_token_id
for epoch in range(epoch_num):
    # The description is constant within an epoch, so set it once here.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch}")
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # -100 is the index ignored by the model's cross-entropy loss.
        # masked_fill is out-of-place, so the dataset's stored tensors are
        # left untouched.
        labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Show the latest batch loss next to the progress bar.
        loop.set_postfix(loss=loss.item())

## Save model

In [None]:
# Persist the fine-tuned weights and the tokenizer files in separate directories.
weights_dir = "./t5_weights"
tokenizer_dir = "./t5_tokenizer"
model.save_pretrained(weights_dir)
tokenizer.save_pretrained(tokenizer_dir)

## Test

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the saved model and tokenizer.
# These must be the directories the "Save model" section writes to
# ("./t5_weights" and "./t5_tokenizer"); loading from any other path fails on
# a fresh run because nothing has been saved there.
model_path = "./t5_weights"
tokenizer_path = "./t5_tokenizer"

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # Set the model to evaluation mode


In [None]:
def summarize_paragraph(paragraph, max_length=150, min_length=30):
    """Generate a summary of ``paragraph`` with the notebook's model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        paragraph: Text to summarize; ``"summarize: "`` is prepended before
            tokenization and the input is truncated to 512 tokens.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    # Tokenize the input text
    inputs = tokenizer("summarize: " + paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)

    # Generate summary. The attention mask from the tokenizer is passed
    # explicitly so generation does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary


In [None]:
# Sample text used to try out the summarizer.
paragraph = """The COVID-19 pandemic has caused a global economic slowdown. Governments around the world are taking measures to address the crisis. Health systems are under immense pressure, and countries are introducing emergency protocols to handle the situation."""

summary = summarize_paragraph(paragraph)
# Print the source text followed by its generated summary.
for label, text in (("Original Paragraph:", paragraph), ("Summary:", summary)):
    print(label, text)
