In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer

In [2]:
ls ../

annot.opcorpora.xml  dict.opcorpora.xml  [0m[01;34mtask1[0m/
[01;34marchive[0m/             requirements.txt    [01;34mtask2[0m/


In [3]:
train_df = pd.read_csv("../archive/cnn_dailymail/train.csv")
test_df = pd.read_csv("../archive/cnn_dailymail/test.csv")
validation_df = pd.read_csv("../archive/cnn_dailymail/validation.csv")

In [4]:
print(train_df.describe())

                                              id  \
count                                     287113   
unique                                    287113   
top     0001d1afc246a7964130f43ae940af6bc6c57f01   
freq                                           1   

                                                  article  \
count                                              287113   
unique                                             284005   
top     (CNN) -- Dubai could lose its place on the Wom...   
freq                                                    3   

                                               highlights  
count                                              287113  
unique                                             282197  
top     This page includes the show Transcript and the...  
freq                                                   83  


In [5]:
# Data Cleaning and Preprocessing (text-based EDA)
# Remove any non-alphanumeric characters and extra whitespaces
import re

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(' +', ' ', text)
    return text

train_df['article'] = train_df['article'].apply(clean_text)
train_df['highlights'] = train_df['highlights'].apply(clean_text)

In [6]:
from datasets import load_metric
from rouge_score import rouge_scorer

In [7]:
# Function to generate summaries
def generate_summary(article_text):
    inputs = tokenizer.encode("summarize: " + article_text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [8]:
# # Function to calculate ROUGE scores
def calculate_rouge_scores(original_summary, generated_summary):  
    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
    
    # Calculate ROUGE scores
    rouge_scores = scorer.score(generated_summary, original_summary)
    return rouge_scores

# def calculate_rouge_scores(original_summary, generated_summary):
#     rouge = load_metric("rouge")
#     scores = rouge.compute(predictions=[generated_summary], references=[original_summary])
#     return scores

## Bart open-LLM model

In [15]:
# Specify the BART model name
model_name = "facebook/bart-large-cnn"

# Load the BART tokenizer and model
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [18]:
# Summarize and evaluate a single article from the test dataset
article = test_df.iloc[0]['article']
original_summary = test_df.iloc[0]['highlights']
generated_summary = generate_summary(article)
rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

print("Original Summary:")
print(original_summary)
print("\nGenerated Summary:")
print(generated_summary)

  rouge = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Original Summary:
Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer .

Generated Summary:
U.S consumer advisory group set up by Department of Transportation said that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. Tests conducted by the FAA use planes with a 31 inch pitch, a standard which on some airlines has decreased.


In [19]:
# Print ROUGE scores line by line
for metric, scores in rouge_scores.items():
    print(f"{metric}:")
    print(f"Precision: {scores.precision}")
    print(f"Recall: {scores.recall}")
    print(f"F1 Score: {scores.fmeasure}")
    print()


rouge1:
Precision: 0.4117647058823529
Recall: 0.24561403508771928
F1 Score: 0.3076923076923077

rouge2:
Precision: 0.21212121212121213
Recall: 0.125
F1 Score: 0.15730337078651685

rougeL:
Precision: 0.35294117647058826
Recall: 0.21052631578947367
F1 Score: 0.2637362637362637

rougeLsum:
Precision: 0.38235294117647056
Recall: 0.22807017543859648
F1 Score: 0.28571428571428564



## T5 (Text-to-Text Transfer Transformer)

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [10]:
# Load the T5 model and tokenizer
model_name="t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
# Summarize and evaluate a single article from the test dataset
article = test_df.iloc[0]['article']
original_summary = test_df.iloc[0]['highlights']
generated_summary = generate_summary(article)
rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

print("Original Summary:")
print(original_summary)
print("\nGenerated Summary:")
print(generated_summary)

Original Summary:
Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer .

Generated Summary:
some experts are questioning if shrinking space on planes is putting our health and safety in danger. this week, a consumer advisory group set up by the department of transportation said that while the government is happy to set standards for animals flying on planes, it does not stipulate a minimum amount of space for humans.


In [12]:
# Print ROUGE scores line by line
for metric, scores in rouge_scores.items():
    print(f"{metric}:")
    print(f"Precision: {scores.precision}")
    print(f"Recall: {scores.recall}")
    print(f"F1 Score: {scores.fmeasure}")
    print()


rouge1:
Precision: 0.38235294117647056
Recall: 0.23214285714285715
F1 Score: 0.28888888888888886

rouge2:
Precision: 0.09090909090909091
Recall: 0.05454545454545454
F1 Score: 0.06818181818181819

rougeL:
Precision: 0.2647058823529412
Recall: 0.16071428571428573
F1 Score: 0.19999999999999998

rougeLsum:
Precision: 0.35294117647058826
Recall: 0.21428571428571427
F1 Score: 0.26666666666666666



In [None]:
# на самом деле эта аннотация мне нравится больше чем предыдущая,
# хотя по скору как будто хуже, но при этом смысл плюс минус как в оригинале

## DistilBERT

In [13]:
from transformers import DistilBertTokenizer, DistilBertModel

In [15]:
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

In [42]:
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertModel.from_pretrained(model_name)

def annotate_text_with_distilbert(input_text, num_sentences=3):
    # Tokenize the input text
    tokens = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    # Get the hidden states from the DistilBERT model
    with torch.no_grad():
        outputs = model(**tokens)
    
    # Extract the embeddings for each token
    embeddings = outputs.last_hidden_state
    
    # Calculate the importance scores for each token (e.g., sum of token embeddings along the sequence dimension)
    importance_scores = torch.sum(embeddings, dim=2).squeeze()
    
    # Calculate the sum of importance scores for each sentence
    sentence_importance = torch.sum(importance_scores, dim=0)
    
    if sentence_importance.dim() == 0:
        # If sentence_importance is a scalar, convert it to a tensor with one dimension
        sentence_importance = sentence_importance.unsqueeze(0)
    
    # Get the top N sentences based on the sum of importance scores
    top_indices = torch.topk(sentence_importance, k=min(num_sentences, sentence_importance.size(0)), dim=0).indices
    
    # Sort the indices to maintain order
    top_indices = sorted(top_indices.tolist())
    
    # Decode the tokens to get the original text
    decoded_text = tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)
    
    # Split the text into sentences
    sentences = decoded_text.split('. ')
    
    # Extract the top N sentences from the decoded text
    annotated_text = '. '.join([sentences[i] for i in top_indices if i < len(sentences)])
    
    return annotated_text

In [45]:
# Evaluate the model on a sample article
sample_index = 0  # Change this to evaluate other articles
article = test_df.iloc[sample_index]['article']
original_summary = test_df.iloc[sample_index]['highlights']
generated_summary = annotate_text_with_distilbert(article)

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores(original_summary, generated_summary)

# Print the results
print("Original Summary:")
print(original_summary)
print("\nGenerated Summary:")
print(generated_summary)

# Debugging output for generated_summary
# print("\nGenerated Summary (Debug Output):")
# print(repr(generated_summary))  # This will print the repr of the string, including special characters
# Check if generated_summary is empty
# if not generated_summary.strip():
#     print("\nWARNING: generated_summary is empty!")
# Print the original code for generated_summary
# print("\nGenerated Summary (Original):")
# print(generated_summary)


Original Summary:
Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer .

Generated Summary:
ever noticed how plane seats appear to be getting smaller and smaller? with increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk


In [46]:
# Print ROUGE scores line by line
for metric, scores in rouge_scores.items():
    print(f"{metric}:")
    print(f"Precision: {scores.precision}")
    print(f"Recall: {scores.recall}")
    print(f"F1 Score: {scores.fmeasure}")
    print()


rouge1:
Precision: 0.35294117647058826
Recall: 0.3333333333333333
F1 Score: 0.34285714285714286

rouge2:
Precision: 0.15151515151515152
Recall: 0.14285714285714285
F1 Score: 0.14705882352941174

rougeL:
Precision: 0.2647058823529412
Recall: 0.25
F1 Score: 0.2571428571428572

rougeLsum:
Precision: 0.3235294117647059
Recall: 0.3055555555555556
F1 Score: 0.31428571428571433

