##### Imports

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import pipeline
from datasets import load_metric
from concurrent.futures import ProcessPoolExecutor

import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')


##### Load data

In [2]:

# Load data
test_dataset = pd.read_csv('../data/processed/new_ds_test_dataset.csv')
train_dataset = pd.read_csv('../data/processed/new_ds_train_dataset.csv')

# Ensure all documents are strings
train_dataset['document'] = train_dataset['document'].astype(str)
train_dataset['summary'] = train_dataset['summary'].astype(str)
test_dataset['document'] = test_dataset['document'].astype(str)
test_dataset['summary'] = test_dataset['summary'].astype(str)

# Handle missing values (remove rows with missing documents or summaries)
train_dataset.dropna(subset=['document', 'summary'], inplace=True)
test_dataset.dropna(subset=['document', 'summary'], inplace=True)

# Convert the pandas DataFrames to Hugging Face Datasets
train = Dataset.from_pandas(train_dataset)
test = Dataset.from_pandas(test_dataset)

# Reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict({
    'train': train,
    'test': test
})



##### Model Pipeline

In [3]:
# Initialize the summarization pipeline with distilbart-cnn-12-6 model
summarizer = pipeline(task='summarization', model='sshleifer/distilbart-cnn-12-6')


In [45]:
# Prepare the test documents as a list
data = new_ds['test']['document']
data[:5]

['gop eyes gains as voters in 11 states pick governors enlarge image toggle caption jim coleap jim coleap voters 11 states pick governors tonight republicans appear track increase numbers least one potential extend hold twothirds nations top state offices eight gubernatorial seats grabs held democrats three republican hands republicans currently hold 29 governorships democrats 20 rhode islands gov lincoln chafee independent polls race analysts suggest three tonights contests considered competitive states incumbent democratic governors running again montana new hampshire washington while state races remain close call republicans expected wrest north carolina governorship democratic control easily win gopheld seats utah north dakota indiana democrats likely hold seats west virginia missouri expected notch safe wins races seats hold vermont delaware holding sway on health care while occupant governors office historically far less important party controls state legislature top state offici

In [None]:

# Summarize each piece of text with a dynamically adjusted max_length using a small sample set, due to time taken when running the model against the full test dataset
data_subset = data[:100]
summarize_data = []
for i, text in enumerate(data_subset):
    print(f"Working on row {i+1}")
    summary = summarizer(text, max_length=min(len(text)//2, 150), do_sample=False, truncation=True)
    summarize_data.append(summary)

for summary in summarize_data:
    print(summary)

Evaluate Pre-Trained Model

In [42]:

# Evaluate the model
from datasets import load_metric

# Load ROUGE metric
rouge = load_metric('rouge')

# Get reference summaries from the test dataset
reference_summaries = test_dataset['summary'][:100].tolist()

# Calculate ROUGE scores
results = rouge.compute(predictions=summarize_data, references=reference_summaries)

# Create a DataFrame to display the results
def format_rouge_scores(results):
    rows = []
    for key in results.keys():
        score_dict = results[key]
        row = {
            'metric': key,
            'precision': score_dict.mid.precision,
            'recall': score_dict.mid.recall,
            'fmeasure': score_dict.mid.fmeasure
        }
        rows.append(row)
    return pd.DataFrame(rows)

# Format the results into a DataFrame
rouge_df = format_rouge_scores(results)

# Display the DataFrame
print(rouge_df)

      metric  precision    recall  fmeasure
0     rouge1   0.369262  0.100965  0.155742
1     rouge2   0.082581  0.022083  0.034106
2     rougeL   0.225376  0.061082  0.094536
3  rougeLsum   0.225041  0.061082  0.094371


### Summary

- **ROUGE-1**: The model performs relatively better in capturing relevant single words (unigrams) from the reference summaries, with decent precision but low recall.

- **ROUGE-2**: The performance drops significantly for bigrams, indicating the model struggles to capture meaningful phrases from the reference summaries.

- **ROUGE-L and ROUGE-LSum**: The performance for capturing the longest common subsequences is also low, suggesting that the coherence and structure of the generated summaries could be improved.