***Training***

In [None]:
import pandas as pd
# Load the preprocessed review/summary table from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/akhil-ir-a4/preprocessed_data.csv')

In [2]:
# Work on a reproducible random subset of at most 50,000 rows. Capping n at
# len(df) avoids the ValueError that DataFrame.sample raises when n exceeds
# the number of available rows (sampling is without replacement).
df = df.sample(n=min(50000, len(df)), random_state=42)

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token as
# the padding token, so that padding='max_length' can be used further down.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
tokenizer.pad_token = tokenizer.eos_token

class ReviewSummaryDataset(Dataset):
    """Torch dataset turning (review, summary) pairs into language-modelling examples.

    Each item is the review and its summary tokenized together as a single
    sequence, padded/truncated to ``max_length``. The labels are a copy of the
    input ids in which padding positions are replaced by -100 (the ignore
    index of the cross-entropy loss), so padding does not contribute to the
    loss. Note that the labels cover the whole sequence, review tokens included.

    Args:
        tokenizer: tokenizer exposing ``encode_plus`` (e.g. ``GPT2Tokenizer``).
        reviews: indexable sequence of review texts.
        summaries: indexable sequence of summaries, aligned with ``reviews``.
        max_length: fixed token length every example is padded/truncated to.
    """

    def __init__(self, tokenizer, reviews, summaries, max_length):
        self.tokenizer = tokenizer
        self.reviews = reviews
        self.summaries = summaries
        self.max_length = max_length

    def __len__(self):
        """Return the number of (review, summary) pairs."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return ``input_ids``, ``attention_mask`` and ``labels``.

        All three values are 1-D tensors of length ``max_length``.
        """
        review = self.reviews[idx]
        summary = self.summaries[idx]
        # Use the tokenizer handed to this instance, not a module-level global.
        encodings = self.tokenizer.encode_plus(
            review, summary,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # encode_plus returns a batch of one; drop only the batch axis.
        input_ids = encodings.input_ids.squeeze(0)
        attention_mask = encodings.attention_mask.squeeze(0)

        # Mask padding via the attention mask rather than by token id: when the
        # pad token shares its id with a real token (e.g. pad == eos), an
        # id-based mask would also hide genuine tokens from the loss.
        labels = input_ids.detach().clone()
        labels[attention_mask == 0] = -100
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


# Remove rows with missing values
df = df.dropna()

# Split dataset into 75% train / 25% validation.
# A fixed seed keeps the split identical every time it is recomputed (for
# example when the validation set is rebuilt after a kernel restart);
# without it, "validation" rows can overlap rows the model was trained on.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Text'], df['Summary'], test_size=0.25, random_state=42
)

# Initialize datasets; every example is padded/truncated to 512 tokens
train_dataset = ReviewSummaryDataset(tokenizer, train_texts.to_list(), train_labels.to_list(), max_length=512)
val_dataset = ReviewSummaryDataset(tokenizer, val_texts.to_list(), val_labels.to_list(), max_length=512)

# Start from the pretrained GPT-2 language-model head and size its embedding
# matrix to the tokenizer's vocabulary.
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path='gpt2')
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

# Hyper-parameters for the Trainer run
training_args = TrainingArguments(
    # Schedule and optimisation
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Output: no intermediate checkpoints and no logging during training
    output_dir='./results',
    save_strategy="no",
    logging_strategy="no",
)

# Wire the model, the datasets and the hyper-parameters into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model together with its tokenizer, so the output
# directory is self-contained and can be reloaded without re-creating the
# tokenizer configuration by hand.
model_path = "saved_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


***Evaluation***

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [2]:
import pandas as pd
# Load the preprocessed review/summary table from the working directory.
df = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')

In [3]:
# Work on a reproducible random subset of at most 50,000 rows. Capping n at
# len(df) avoids the ValueError that DataFrame.sample raises when n exceeds
# the number of available rows (sampling is without replacement).
df = df.sample(n=min(50000, len(df)), random_state=42)

In [4]:
# Remove rows with missing values
df = df.dropna()

# Split dataset into 75% train / 25% validation.
# A fixed seed keeps the split identical every time it is recomputed; an
# unseeded split yields a different "validation" set on each run, which can
# overlap rows the model was trained on and inflate the scores.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Text'], df['Summary'], test_size=0.25, random_state=42
)

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
from rouge_score import rouge_scorer

# Restore the fine-tuned weights from disk
model_path = "model_kaggle"
model = GPT2LMHeadModel.from_pretrained(model_path)

# Base GPT-2 tokenizer, padding on the left, with the end-of-sequence token
# doubling as the padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

def generate_summary(text):
    """Generate a short summary for one review with the fine-tuned model.

    The review is tokenized (truncated to 512 tokens) and continued with beam
    search for at most 25 new tokens. Only the newly generated continuation is
    decoded and returned; the prompt is not echoed back.

    Args:
        text: review text to summarize.

    Returns:
        The decoded continuation as a string with special tokens removed, or
        an empty string when ``text`` tokenizes to no tokens at all.
    """
    # A single sequence needs no padding; padding it to 512 tokens would only
    # make every generation step attend over hundreds of masked positions.
    encodings = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_tensors="pt",
        truncation=True
    )
    inputs = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    prompt_length = inputs.shape[1]
    if prompt_length == 0:
        # Nothing to condition on, so there is nothing to summarize.
        return ""

    max_new_tokens = 25  # Upper bound on the number of generated tokens
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,  # Bound the continuation, not the total length
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id  # Explicit, instead of the implicit eos fallback
    )

    # For a decoder-only model the returned sequence is prompt + continuation.
    # Drop the prompt so the review text is not scored as part of the summary.
    generated_tokens = outputs[0][prompt_length:]
    summary = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return summary

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled; one result row
# is collected per validation example.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
results = []

# Walk the validation reviews alongside their reference summaries
for review, reference in zip(val_texts, val_labels):
    candidate = generate_summary(review)
    rouge = scorer.score(reference, candidate)
    rouge1, rouge2, rouge_l = rouge['rouge1'], rouge['rouge2'], rouge['rougeL']

    results.append({
        "Text": review,
        "ROUGE-1 Precision": rouge1.precision,
        "ROUGE-1 Recall": rouge1.recall,
        "ROUGE-1 F1": rouge1.fmeasure,
        "ROUGE-2 Precision": rouge2.precision,
        "ROUGE-2 Recall": rouge2.recall,
        "ROUGE-2 F1": rouge2.fmeasure,
        "ROUGE-L Precision": rouge_l.precision,
        "ROUGE-L Recall": rouge_l.recall,
        "ROUGE-L F1": rouge_l.fmeasure,
    })

# Write one CSV row per validation example, without the index column
results_df = pd.DataFrame(results)
results_df.to_csv('summary_rouge_scores.csv', index=False)
