# BART Model

In [3]:
from transformers import BartTokenizer, BartForConditionalGeneration

model_name = 'facebook/bart-large-cnn'  # Example model
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


In [4]:
from transformers import pipeline

# Function to summarize text using BART
def summarize_bart(file_path):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Split the text into chunks of 1024 tokens (this is the maximum length for BART)
    chunks = [text[i:i + 1024] for i in range(0, len(text), 1024)]
    
    summaries = []
    for chunk in chunks:
        summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)
        summaries.append(summary[0]['summary_text'])
    
    # Combine the summaries
    combined_summary = ' '.join(summaries)
    
    return combined_summary

# Example usage:
file_path = '/Users/bhargavsrisainama/Documents/personal/CU Boulder Data Science/Data Mining/Data Mining Group Project/Raw Data/combined_text.txt'
print("BART Summary:", summarize_bart(file_path))

Your max_length is set to 150, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


BART Summary: The idea of a random variable can be a little bit confusing. Random variables are really ways to map outcomes of random processes to numbers. So if you have a random process, like you're flipping a coin or you're rolling dice or you are measuring the rain that might fall tomorrow, you're really just mapping outcomes of that. A coin flip can be defined as 1 if heads, 0 if tails. Capital Y is equal to the sum of rolls of let's say 7 dice. This is actually a fairly typical way of defining a random variable, especially for a coin flip. But I could have defined this as 100. And I can define this as 703. The probability that the sum of the upward faces after rolling seven dice is less than or equal to 30. g a random variable in that way. The natural question you might ask is, why are we doing this? What's so useful about defining random variables like this? It will become more apparent as we get a little bit deeper in probability. These are different than traditional variables,

# T5 Model


In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Function to summarize text using T5
def summarize_t5(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=180, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(outputs[0])
    return summary

file_path = '/Users/bhargavsrisainama/Documents/personal/CU Boulder Data Science/Data Mining/Data Mining Group Project/Raw Data/combined_text.txt'
# Example usage:
print("T5 Summary:", summarize_t5(file_path))


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5 Summary: <pad> random variables are really ways to map outcomes of random processes to numbers. so if you have a random process, like you're flipping a coin or you're rolling dice, so random process, you're really just mapping outcomes of that to numbers. if you have a random process, like you're flipping a coin or you're rolling dice, so random process, you're really just mapping outcomes of that to numbers.</s>


# Pegasus Model

In [6]:
def summarize_pegasus(file_path, max_summary_length=300):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split text into smaller chunks
    chunk_size = 1024  # or any desired chunk size
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

    # Generate summary for each chunk
    summaries = []
    for chunk in chunks:
        inputs = tokenizer.encode(chunk, return_tensors="pt", max_length=1024, truncation=True)
        summary_ids = model.generate(inputs, max_length=max_summary_length, num_beams=4, length_penalty=1.5, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0])
        summaries.append(summary)

    return '\n'.join(summaries)

file_path = '/Users/bhargavsrisainama/Documents/personal/CU Boulder Data Science/Data Mining/Data Mining Group Project/Raw Data/combined_text.txt'
# Example usage:
print("PEGASUS Summary:", summarize_pegasus(file_path, max_summary_length=500))


PEGASUS Summary: <pad> <extra_id_0> in this video is the idea of a random variable. <extra_id_1> is the idea of a random variable. <extra_id_2> capital X, I will define it as-- It is going to be equal to 1 if my fair die rolls heads-- let me write it this way-- if tails. And it's going to be equal to 0 if tails. And it's going to be equal to 0 if tails. And it's going to be equal to 0 if tails.</s>
<pad> <extra_id_0> the sum of the upward face after rolling 7 dice. And then we are quantifying an outcome for a random process where the random process is rolling these 7 dice and seeing what sides show up on top. And then we are definining an outcome for a random process where the random process is rolling these 7 dice and seeing what sides show up on top. And then we're taking those and we're taking the sum and we are definining an outcome for a random process.</s>
<pad> <extra_id_0> g a random variable in that way. And you can start to do a little bit more math on the outcomes. And you c

# BERT Model

In [18]:
from transformers import BertModel, BertTokenizer
from transformers import BertLMHeadModel, BertTokenizer

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# model = BertLMHeadModel.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_extractive_summarize(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    sentences = text.split('. ')
    sentence_embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model(**inputs)
        sentence_embeddings.append(outputs.last_hidden_state.mean(1).detach().numpy())

    similarities = cosine_similarity(np.vstack(sentence_embeddings))
    avg_similarities = similarities.mean(axis=0)
    top_idx = np.argsort(avg_similarities)[-5:]  # Adjust number of sentences as needed
    summary = ' '.join([sentences[i] for i in top_idx])
    return summary

# def bert_extractive_summarize(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         text = file.read()

#     sentences = text.split('. ')
#     sentence_embeddings = []
#     for sentence in sentences:
#         inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
#         outputs = model(**inputs)
#         # Retrieve the hidden states from the last layer
#         last_hidden_states = outputs.last_hidden_state
#         # Calculate the mean of the hidden states for each sentence
#         sentence_embedding = last_hidden_states.mean(dim=1).detach().numpy()
#         sentence_embeddings.append(sentence_embedding)

#     similarities = cosine_similarity(np.vstack(sentence_embeddings))
#     avg_similarities = similarities.mean(axis=0)
#     top_idx = np.argsort(avg_similarities)[-5:]  # Adjust number of sentences as needed
#     summary = ' '.join([sentences[i] for i in top_idx])
#     return summary


file_path = '/Users/bhargavsrisainama/Documents/personal/CU Boulder Data Science/Data Mining/Data Mining Group Project/Raw Data/combined_text.txt'
# Example usage:
print("BERT Extractive Summary:", bert_extractive_summarize(file_path))


BERT Extractive Summary: Once again, we are quantifying an outcome for a random process where the random process is rolling these 7 dice and seeing what sides show up on top We can define another random variable capital Y as equal to, let's say, the sum of rolls of let's say 7 dice But now we can just write the probability that capital Y is less than or equal to 30 Notice we have taken this random process, flipping a coin, and we've mapped the outcomes of that random process And if someone else cares about the probability that this sum of the upward face after rolling seven dice-- if they say, hey, what's the probability that that's even, instead of having to write all that over, they can say, well, what's the probability that Y is even? Now the one thing that I do want to emphasize is how these are different than traditional variables, traditional variables that you see in your algebra class like x plus 5 is equal to 6, usually denoted by lowercase variables


# LED Model

In [21]:
from transformers import LEDTokenizer, LEDForConditionalGeneration

# Load LED tokenizer and model
tokenizer = LEDTokenizer.from_pretrained('allenai/led-large-16384-arxiv')
model = LEDForConditionalGeneration.from_pretrained('allenai/led-large-16384-arxiv')

# Function to summarize text from a file using LED
def summarize_led_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# Example usage:
file_path = '/Users/bhargavsrisainama/Documents/personal/CU Boulder Data Science/Data Mining/Data Mining Group Project/Raw Data/combined_text.txt'
print("LED Summary:", summarize_led_from_file(file_path))


LED Summary:  in this video, we discuss the idea of a random variable and random variables in probability. 
 random variables are really ways to map outcomes of random processes to numbers. So if you have a random process, like you have a random process, like you're flipping a coin or you're rolling dice or you are measuring the rain that might fall tomorrow, so random process, you are really just mapping outcomes of that to numbers. 
 random process, you are really just mapping outcomes of that to numbers. 
 random process, you are really just mapping outcomes of that to numbers. 
 random variables are really ways to map outcomes of random processes to numbers. 


# TFIDF Model

In [9]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Load NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to generate summary
def generate_summary(text, num_sentences):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)

    # Remove stopwords and tokenize words
    stop_words = set(stopwords.words("english"))
    word_tokens = [word_tokenize(sentence.lower()) for sentence in sentences]
    filtered_words = [[word for word in words if word not in stop_words] for words in word_tokens]

    # Flatten the list of filtered words
    flat_words = [' '.join(words) for words in filtered_words]

    # Calculate TF-IDF scores for each word
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(flat_words)

    # Calculate TF-IDF scores for each sentence
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        sentence_score = 0
        for word in word_tokenize(sentence.lower()):
            if word not in stop_words:
                word_index = tfidf_vectorizer.vocabulary_.get(word)
                if word_index is not None:
                    sentence_score += tfidf_matrix[i, word_index]
        sentence_scores[i] = sentence_score

    # Select the top N most important sentences as the summary
    sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    top_sentences = sorted_sentences[:num_sentences]
    summary = ' '.join([sentences[i] for i in sorted(top_sentences)])
    
    return summary

# Input text
input_text = """
    What I want to discuss a little bit in this video is the idea of a random variable. And random variables at first can be a little bit confusing because we will want to think of them as traditional variables that you were first exposed to in algebra class. And that's not quite what random variables are. Random variables are really ways to map outcomes of random processes to numbers. So if you have a random process, like you're flipping a coin or you're rolling dice or you are measuring the rain that might fall tomorrow, so random process, you're really just mapping outcomes of that to numbers. You are quantifying the outcomes. So what's an example of a random variable? Well, let's define one right over here. So I'm going to define random variable capital X. And they tend to be denoted by capital letters. So random variable capital X, I will define it as-- It is going to be equal to 1 if my fair die rolls heads-- let me write it this way-- if heads. And it's going to be equal to 0 if tails. I could have defined this any way I wanted to. This is actually a fairly typical way of defining a random variable, especially for a coin flip. But I could have defined this as 100. And I could have defined this as 703. And this would still be a legitimate random variable. It might not be as pure a way of thinking about it as defining 1 as heads and 0 as tails. But that would have been a random variable. Notice we have taken this random process, flipping a coin, and we've mapped the outcomes of that random process. And we've quantified them. 1 if heads, 0 if tails. We can define another random variable capital Y as equal to, let's say, the sum of rolls of let's say 7 dice. And when we talk about the sum, we're talking about the sum of the 7-- let me write this-- the sum of the upward face after rolling 7 dice. Once again, we are quantifying an outcome for a random process where the random process is rolling these 7 dice and seeing what sides show up on top. And then we are taking those and we're taking the sum and we are defining a random variable in that way. So the natural question you might ask is, why are we doing this? What's so useful about defining random variables like this? It will become more apparent as we get a little bit deeper in probability. But the simple way of thinking about it is as soon as you quantify outcomes, you can start to do a little bit more math on the outcomes. And you can start to use a little bit more mathematical notation on the outcome. So for example, if you cared about the probability that the sum of the upward faces after rolling seven dice-- if you cared about the probability that that sum is less than or equal to 30, the old way that you would have to have written it is the probability that the sum of-- and you would have to write all of what I just wrote here-- is less than or equal to 30. You would have had to write that big thing. And then you would try to figure it out somehow if you had some information. But now we can just write the probability that capital Y is less than or equal to 30. It's a little bit cleaner notation. And if someone else cares about the probability that this sum of the upward face after rolling seven dice-- if they say, hey, what's the probability that that's even, instead of having to write all that over, they can say, well, what's the probability that Y is even? Now the one thing that I do want to emphasize is how these are different than traditional variables, traditional variables that you see in your algebra class like x plus 5 is equal to 6, usually denoted by lowercase variables. y is equal to x plus 7. These variables, you can essentially assign values. You either can solve for them-- so in this case, x is an unknown. You could subtract 5 from both sides and solve for x. Say that x is going to be equal to 1. In this case, you could say, well, x is going to vary. We can assign a value to x and see how y varies as a function of x. You can either assign a variable, you can assign values to them. Or you can solve for them. You could say, hey x is going to be 1 in this case. That's not going to be the case with a random variable. A random variable can take on many, many, many, many, many, many different values with different probabilities. And it makes much more sense to talk about the probability of a random variable equaling a value, or the probability that it is less than or greater than something, or the probability that it has some property. And you see that in either of these cases. In the next video, we'll continue this discussion and we'll talk a little bit about the types of random variables you can have.
    """

# Generate summary
summary = generate_summary(input_text, 3)  # Generate a summary of 3 sentences
print("Generated Summary:")
print(summary)


Generated Summary:
So for example, if you cared about the probability that the sum of the upward faces after rolling seven dice-- if you cared about the probability that that sum is less than or equal to 30, the old way that you would have to have written it is the probability that the sum of-- and you would have to write all of what I just wrote here-- is less than or equal to 30. And if someone else cares about the probability that this sum of the upward face after rolling seven dice-- if they say, hey, what's the probability that that's even, instead of having to write all that over, they can say, well, what's the probability that Y is even? A random variable can take on many, many, many, many, many, many different values with different probabilities.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bhargavsrisainama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhargavsrisainama/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Load NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to generate summary
def generate_summary(text, num_sentences):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)

    # Remove stopwords and tokenize words
    stop_words = set(stopwords.words("english"))
    word_tokens = [word_tokenize(sentence.lower()) for sentence in sentences]
    filtered_words = [[word for word in words if word not in stop_words] for words in word_tokens]

    # Flatten the list of filtered words
    flat_words = [' '.join(words) for words in filtered_words]

    # Calculate TF-IDF scores for each word
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(flat_words)

    # Calculate TF-IDF scores for each sentence
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        sentence_score = 0
        for word in word_tokenize(sentence.lower()):
            if word not in stop_words:
                word_index = tfidf_vectorizer.vocabulary_.get(word)
                if word_index is not None:
                    sentence_score += tfidf_matrix[i, word_index]
        sentence_scores[i] = sentence_score

    # Select the top N most important sentences as the summary
    sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    
    # Find sentences containing important keywords
    summary_sentences = []
    for i in sorted_sentences:
        summary_sentences.append(sentences[i])
        if len(summary_sentences) == num_sentences:
            break

    summary = ' '.join(summary_sentences)
    
    return summary

# Input text
input_text = """
    What I want to discuss a little bit in this video is the idea of a random variable. And random variables at first can be a little bit confusing because we will want to think of them as traditional variables that you were first exposed to in algebra class. And that's not quite what random variables are. Random variables are really ways to map outcomes of random processes to numbers. So if you have a random process, like you're flipping a coin or you're rolling dice or you are measuring the rain that might fall tomorrow, so random process, you're really just mapping outcomes of that to numbers. You are quantifying the outcomes. So what's an example of a random variable? Well, let's define one right over here. So I'm going to define random variable capital X. And they tend to be denoted by capital letters. So random variable capital X, I will define it as-- It is going to be equal to 1 if my fair die rolls heads-- let me write it this way-- if heads. And it's going to be equal to 0 if tails. I could have defined this any way I wanted to. This is actually a fairly typical way of defining a random variable, especially for a coin flip. But I could have defined this as 100. And I could have defined this as 703. And this would still be a legitimate random variable. It might not be as pure a way of thinking about it as defining 1 as heads and 0 as tails. But that would have been a random variable. Notice we have taken this random process, flipping a coin, and we've mapped the outcomes of that random process. And we've quantified them. 1 if heads, 0 if tails. We can define another random variable capital Y as equal to, let's say, the sum of rolls of let's say 7 dice. And when we talk about the sum, we're talking about the sum of the 7-- let me write this-- the sum of the upward face after rolling 7 dice. Once again, we are quantifying an outcome for a random process where the random process is rolling these 7 dice and seeing what sides show up on top. And then we are taking those and we're taking the sum and we are defining a random variable in that way. So the natural question you might ask is, why are we doing this? What's so useful about defining random variables like this? It will become more apparent as we get a little bit deeper in probability. But the simple way of thinking about it is as soon as you quantify outcomes, you can start to do a little bit more math on the outcomes. And you can start to use a little bit more mathematical notation on the outcome. So for example, if you cared about the probability that the sum of the upward faces after rolling seven dice-- if you cared about the probability that that sum is less than or equal to 30, the old way that you would have to have written it is the probability that the sum of-- and you would have to write all of what I just wrote here-- is less than or equal to 30. You would have had to write that big thing. And then you would try to figure it out somehow if you had some information. But now we can just write the probability that capital Y is less than or equal to 30. It's a little bit cleaner notation. And if someone else cares about the probability that this sum of the upward face after rolling seven dice-- if they say, hey, what's the probability that that's even, instead of having to write all that over, they can say, well, what's the probability that Y is even? Now the one thing that I do want to emphasize is how these are different than traditional variables, traditional variables that you see in your algebra class like x plus 5 is equal to 6, usually denoted by lowercase variables. y is equal to x plus 7. These variables, you can essentially assign values. You either can solve for them-- so in this case, x is an unknown. You could subtract 5 from both sides and solve for x. Say that x is going to be equal to 1. In this case, you could say, well, x is going to vary. We can assign a value to x and see how y varies as a function of x. You can either assign a variable, you can assign values to them. Or you can solve for them. You could say, hey x is going to be 1 in this case. That's not going to be the case with a random variable. A random variable can take on many, many, many, many, many, many different values with different probabilities. And it makes much more sense to talk about the probability of a random variable equaling a value, or the probability that it is less than or greater than something, or the probability that it has some property. And you see that in either of these cases. In the next video, we'll continue this discussion and we'll talk a little bit about the types of random variables you can have.
    """

# Generate summary
reference_summary = generate_summary(input_text, 5)  # Generate a summary of 3 sentences
print("Generated Summary:")
print(reference_summary)

Generated Summary:
So for example, if you cared about the probability that the sum of the upward faces after rolling seven dice-- if you cared about the probability that that sum is less than or equal to 30, the old way that you would have to have written it is the probability that the sum of-- and you would have to write all of what I just wrote here-- is less than or equal to 30. A random variable can take on many, many, many, many, many, many different values with different probabilities. And if someone else cares about the probability that this sum of the upward face after rolling seven dice-- if they say, hey, what's the probability that that's even, instead of having to write all that over, they can say, well, what's the probability that Y is even? Now the one thing that I do want to emphasize is how these are different than traditional variables, traditional variables that you see in your algebra class like x plus 5 is equal to 6, usually denoted by lowercase variables. And it m

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bhargavsrisainama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhargavsrisainama/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Evaluation Metrics

In [11]:
reference_summary

"So for example, if you cared about the probability that the sum of the upward faces after rolling seven dice-- if you cared about the probability that that sum is less than or equal to 30, the old way that you would have to have written it is the probability that the sum of-- and you would have to write all of what I just wrote here-- is less than or equal to 30. A random variable can take on many, many, many, many, many, many different values with different probabilities. And if someone else cares about the probability that this sum of the upward face after rolling seven dice-- if they say, hey, what's the probability that that's even, instead of having to write all that over, they can say, well, what's the probability that Y is even? Now the one thing that I do want to emphasize is how these are different than traditional variables, traditional variables that you see in your algebra class like x plus 5 is equal to 6, usually denoted by lowercase variables. And it makes much more sen

In [12]:
#create an emtpy table to store the rogue and bleu scores
import pandas as pd
evaluation_metrics = pd.DataFrame(columns=["Model_Name" , 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Bleu'])

In [13]:
#use rogue score and bleu score to compare summarize_bart() and reference_summary
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(summarize_bart(file_path), reference_summary)
print("ROUGE Scores:", scores)

# Calculate BLEU score
reference = word_tokenize(reference_summary)
candidate = word_tokenize(summarize_bart(file_path))
bleu_score = sentence_bleu([reference], candidate)
print("BLEU Score:", bleu_score)

#append the table with this data
evaluation_metrics = evaluation_metrics.append({'Model_Name': 'BART', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)

Your max_length is set to 150, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


ROUGE Scores: {'rouge1': Score(precision=0.5909090909090909, recall=0.46099290780141844, fmeasure=0.5179282868525896), 'rouge2': Score(precision=0.3150684931506849, recall=0.24555160142348753, fmeasure=0.27599999999999997), 'rougeL': Score(precision=0.34545454545454546, recall=0.2695035460992908, fmeasure=0.30278884462151395)}


Your max_length is set to 150, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


BLEU Score: 0.23147433114647056


  evaluation_metrics = evaluation_metrics.append({'Model_Name': 'BART', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


In [14]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(summarize_t5(file_path), reference_summary)
print("ROUGE Scores:", scores)

# Calculate BLEU score
reference = word_tokenize(reference_summary)
candidate = word_tokenize(summarize_t5(file_path))
bleu_score = sentence_bleu([reference], candidate)
print("BLEU Score:", bleu_score)

#append the table with this data
evaluation_metrics = evaluation_metrics.append({'Model_Name': 'T5', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


Input ids are automatically padded from 512 to 1024 to be a multiple of `config.attention_window`: 1024


ROUGE Scores: {'rouge1': Score(precision=0.07727272727272727, recall=0.2328767123287671, fmeasure=0.11604095563139931), 'rouge2': Score(precision=0.0228310502283105, recall=0.06944444444444445, fmeasure=0.034364261168384876), 'rougeL': Score(precision=0.05454545454545454, recall=0.1643835616438356, fmeasure=0.08191126279863482)}
BLEU Score: 0.00506970223875006


  evaluation_metrics = evaluation_metrics.append({'Model_Name': 'T5', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


In [15]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(summarize_pegasus(file_path), reference_summary)
print("ROUGE Scores:", scores)

# Calculate BLEU score
reference = word_tokenize(reference_summary)
candidate = word_tokenize(summarize_pegasus(file_path))
bleu_score = sentence_bleu([reference], candidate)
print("BLEU Score:", bleu_score)

#append the table with this data
evaluation_metrics = evaluation_metrics.append({'Model_Name': 'PEGASUS', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


Input ids are automatically padded from 224 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 235 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 223 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 240 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 120 to 1024 to be a multiple of `config.attention_window`: 1024


ROUGE Scores: {'rouge1': Score(precision=0.8954545454545455, recall=0.2846820809248555, fmeasure=0.43201754385964913), 'rouge2': Score(precision=0.7397260273972602, recall=0.23444283646888567, fmeasure=0.35604395604395606), 'rougeL': Score(precision=0.6045454545454545, recall=0.19219653179190752, fmeasure=0.29166666666666663)}
BLEU Score: 0.2242114004384734


  evaluation_metrics = evaluation_metrics.append({'Model_Name': 'PEGASUS', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


In [19]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(bert_extractive_summarize(file_path), reference_summary)
print("ROUGE Scores:", scores)

# Calculate BLEU score
reference = word_tokenize(reference_summary)
candidate = word_tokenize(bert_extractive_summarize(file_path))
bleu_score = sentence_bleu([reference], candidate)
print("BLEU Score:", bleu_score)

#append the table with this data
evaluation_metrics = evaluation_metrics.append({'Model_Name': 'BERT', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


ROUGE Scores: {'rouge1': Score(precision=0.5818181818181818, recall=0.7071823204419889, fmeasure=0.6384039900249376), 'rouge2': Score(precision=0.4703196347031963, recall=0.5722222222222222, fmeasure=0.5162907268170426), 'rougeL': Score(precision=0.509090909090909, recall=0.6187845303867403, fmeasure=0.5586034912718204)}
BLEU Score: 0.4582188690891725


  evaluation_metrics = evaluation_metrics.append({'Model_Name': 'BERT', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


In [23]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(summarize_led_from_file(file_path), reference_summary)
print("ROUGE Scores:", scores)

# Calculate BLEU score
reference = word_tokenize(reference_summary)
candidate = word_tokenize(summarize_led_from_file(file_path))
bleu_score = sentence_bleu([reference], candidate)
print("BLEU Score:", bleu_score)

#append the table with this data
evaluation_metrics = evaluation_metrics.append({'Model_Name': 'LED', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


ROUGE Scores: {'rouge1': Score(precision=0.22272727272727272, recall=0.4336283185840708, fmeasure=0.2942942942942943), 'rouge2': Score(precision=0.0319634703196347, recall=0.0625, fmeasure=0.04229607250755287), 'rougeL': Score(precision=0.11363636363636363, recall=0.22123893805309736, fmeasure=0.15015015015015015)}
BLEU Score: 0.014383476368939092


  evaluation_metrics = evaluation_metrics.append({'Model_Name': 'LED', 'Rouge-1': scores['rouge1'].fmeasure, 'Rouge-2': scores['rouge2'].fmeasure, 'Rouge-L': scores['rougeL'].fmeasure, 'Bleu': bleu_score}, ignore_index=True)


In [24]:
evaluation_metrics

Unnamed: 0,Model_Name,Rouge-1,Rouge-2,Rouge-L,Bleu
0,BART,0.517928,0.276,0.302789,0.231474
1,T5,0.116041,0.034364,0.081911,0.00507
2,PEGASUS,0.432018,0.356044,0.291667,0.224211
3,BERT,0.638404,0.516291,0.558603,0.458219
4,LED,0.294294,0.042296,0.15015,0.014383
