**Library**

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK resources
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **Clean and preprocess the ‘Text’ and ‘Summary’ column from the dataset**

In [2]:
# Load stopwords
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stopwords and extra spaces
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Read the CSV file
df = pd.read_csv('/kaggle/input/cse508-winter2024-a4-data/Reviews.csv')

# Clean 'Text' column
df['Text'] = df['Text'].apply(clean_text)

# Clean 'Summary' column
df['Summary'] = df['Summary'].apply(clean_text)

# Generate new CSV file with cleaned data
df.to_csv('/kaggle/working/cleaned_data.csv', index=False)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


# **Model Training**

**Installation**

In [3]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Note: you may need to restart the kernel to use updated packages.


**Library**

In [4]:
import torch
import random
import pandas as pd
from tqdm import tqdm
from rouge import Rouge
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

**Divide the dataset into training and testing (75:25)**

In [7]:
df = pd.read_csv('/kaggle/working/cleaned_data.csv').head(10000)

select_column = df[['Score','Text', 'Summary']]

# Split the dataset into training and testing sets (75:25 ratio)
training_df, testing_df = train_test_split(select_column, test_size=0.25, random_state=42)

**Defining Custom Dataset Class**

In [9]:
class CustomData(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        super().__init__()
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.tokenizer.pad_token = self.tokenizer.eos_token
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        review_text = str(self.df.iloc[idx]['Text'])
        summary_text = str(self.df.iloc[idx]['Summary'])

        # Combine review text and summary text
        text = f"Review Text: {review_text}\nSummary: {summary_text}"

        # Tokenize the combined text
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        # Convert score to tensor
        label = torch.tensor(self.df.iloc[idx]['Score'])

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

**Training Loop**

In [10]:
# Instantiate GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define dataset and dataloader
training_dataset = CustomData(training_df, tokenizer, max_length=128)
train_loader = DataLoader(training_dataset, batch_size=10, shuffle=True)

# Define optimizer and scheduler
learning_rate = 1e-5
epochs = 3
warmup_steps = int(0.1 * len(train_loader) * epochs)
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=len(train_loader) * epochs)

# Fine-tuning loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(epochs):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        # Backward pass
        loss.backward()
        # Update weights
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Save the fine-tuned model
model.save_pretrained("/kaggle/working/fine_tuned_gpt2_Model")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch 1/3, Loss: 2.988635778427124
Epoch 2/3, Loss: 1.8884292840957642
Epoch 3/3, Loss: 1.8239219188690186


**Generating Summaries**

In [36]:
def generate_summary(review_text):
    # Tokenize the review text
    inputs = tokenizer.encode_plus(
        review_text,
        return_tensors="pt",
        max_length=1024,
        truncation=True
    )

    # Move input tensors to the same device as the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    # Generate summary using the model
    model = GPT2LMHeadModel.from_pretrained('/kaggle/working/fine_tuned_gpt2_Model').to(device)
    summary_ids = model.generate(inputs['input_ids'], max_length=1024, num_beams=4, early_stopping=True)

    # Decode the generated summary tokens
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return generated_summary

**ROUGE Calculation Function**

In [37]:
import csv
from rouge import Rouge

# Function to calculate ROUGE scores
def rougescore(generated_summary, actual_summary):
    rouge = Rouge()
    rougescore = rouge.get_scores(generated_summary, actual_summary)
    return rougescore

**Save CSV File**

In [39]:
# Read CSV file
csv_file = '/kaggle/working/cleaned_data.csv'
output_file = '/kaggle/working/rougescore.csv'  
num_rows = 200 

# Open output file in write mode
with open(output_file, mode='w', newline='', encoding='utf-8') as output_csv:
    csv_writer = csv.writer(output_csv)
    csv_writer.writerow(['Text', 'Generated Summary', 'ROUGE-1 Precision', 'ROUGE-1 Recall', 'ROUGE-1 F1',
                         'ROUGE-2 Precision', 'ROUGE-2 Recall', 'ROUGE-2 F1',
                         'ROUGE-L Precision', 'ROUGE-L Recall', 'ROUGE-L F1'])
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        csv_reader = csv.DictReader(file)
        for idx, row in enumerate(csv_reader):
            if idx >= num_rows:
                break  

            review_text = row['Text']
            actual_summary = row['Summary']  # Adjust column name

            # Skip rows with empty actual summary
            if not actual_summary:
                print(f"Skipping row {idx + 1} due to empty Summary.")
                continue

            # Generate summary
            gen_summary = generate_summary(review_text)
            splited_summary = gen_summary.split(review_text)
            generated_summary = splited_summary[1].strip()

            # Calculate ROUGE scores
            rouge_scores = rougescore(generated_summary, actual_summary)

            # Write results to output file
            csv_writer.writerow([review_text, generated_summary,
                                 rouge_scores[0]['rouge-1']['p'], rouge_scores[0]['rouge-1']['r'], rouge_scores[0]['rouge-1']['f'],
                                 rouge_scores[0]['rouge-2']['p'], rouge_scores[0]['rouge-2']['r'], rouge_scores[0]['rouge-2']['f'],
                                 rouge_scores[0]['rouge-l']['p'], rouge_scores[0]['rouge-l']['r'], rouge_scores[0]['rouge-l']['f']])

    print("ROUGE scores calculated and CSV File saved to", output_file)

ROUGE scores calculated and CSV File saved to /kaggle/working/rougescore.csv
