In [None]:
import zipfile
import json
import io
import os

In [None]:
def read_json_from_folder(folder_path):
    data = {}
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

    for json_file in json_files:
        file_path = os.path.join(folder_path, json_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data[json_file] = json.load(f)
    
    return data


In [None]:
dataset_path = "/kaggle/input/amasum/min_10_max_100_revs_filt_complete"

train_path = os.path.join(dataset_path, "train")
test_path = os.path.join(dataset_path, "test")
valid_path = os.path.join(dataset_path, "valid")


In [None]:
train_data = read_json_from_folder(train_path)
test_data = read_json_from_folder(test_path)
valid_data = read_json_from_folder(valid_path)

# Print a sample JSON file
sample_file = list(train_data.keys())[0]
#print(f"Sample file: {sample_file}\nData:\n", train_data[sample_file])

In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm

def prepare_dataset(folder_path):
    """Reads JSON files and prepares data for summarization, including pros and cons."""
    data_list = []
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

    for json_file in tqdm(json_files, desc=f"Processing {folder_path}"):
        file_path = os.path.join(folder_path, json_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

            # Extract all customer reviews
            reviews = []
            if "customer_reviews" in data:
                for review in data["customer_reviews"]:
                    reviews.append(review.get("text", ""))

            # Extract summary, pros, and cons
            if "website_summaries" in data:
                summary_data = data["website_summaries"][0]  # First summary
                summary = summary_data.get("verdict", "")
                pros = ", ".join(summary_data.get("pros", []))  # Convert list to string
                cons = ", ".join(summary_data.get("cons", []))  # Convert list to string

                if reviews and summary:
                    combined_reviews = " ".join(reviews)  # Merge all reviews
                    data_list.append({
                        "reviews": combined_reviews,
                        "pros": pros,
                        "cons": cons,
                        "summary": summary
                    })

    return pd.DataFrame(data_list)

# Define dataset paths
dataset_path = "/kaggle/input/amasum/min_10_max_100_revs_filt_complete"

# Prepare Train, Validation, and Test Data
train_df = prepare_dataset(os.path.join(dataset_path, "train"))
valid_df = prepare_dataset(os.path.join(dataset_path, "valid"))
test_df = prepare_dataset(os.path.join(dataset_path, "test"))

# Save to CSV for later use
train_df.to_csv("pegasus_train_data.csv", index=False)
valid_df.to_csv("pegasus_valid_data.csv", index=False)
test_df.to_csv("pegasus_test_data.csv", index=False)




Processing /kaggle/input/amasum/min_10_max_100_revs_filt_complete/train: 100%|██████████| 25203/25203 [03:05<00:00, 136.21it/s]
Processing /kaggle/input/amasum/min_10_max_100_revs_filt_complete/valid: 100%|██████████| 3114/3114 [00:22<00:00, 137.99it/s]
Processing /kaggle/input/amasum/min_10_max_100_revs_filt_complete/test: 100%|██████████| 3166/3166 [00:23<00:00, 132.01it/s]


In [3]:
# Display sample
print("Train Data Sample:")
print(train_df['reviews'][0])
print("\nValidation Data Sample:")
print(valid_df['summary'][0])
print("\nTest Data Sample:")
print(test_df['pros'][0])

Train Data Sample:
As long as this is still available, this is the absolute best phone for the money in my opinion. I researched seemingly forever for a low priced full-featured phone, and this fits the bill. The Windows O.S. is seamless and easy to customize and navigate. I just bought two more for my wife and daughter, who were always "borrowing" mine. I just have to chuckle when I see everyone swooning over the latest exhorbitantly priced Apple phones, and paying the equvalent of a car payment to do the same things this phone does. And my $30/ month plan is adequate for my business calling, email checking and file sending needs, which is all I need it for. Effective July 31 2017 Microsoft no longer supports software on this phone. It took two hours at At&t to get contacts from old phone to this one. Only buy if you have a way to transfer your contacts and are a very knowledgeable Windows user. Design is poor as edges of screen are widest part of phone and not Gorilla glass, so any d

In [8]:
import pandas as pd
a = pd.read_csv("pegasus_train_data.csv")


In [9]:
a.shape

(25203, 4)

In [4]:
import pandas as pd
from datasets import Dataset
from transformers import PegasusTokenizer
from tqdm import tqdm

# Load CSV data with controlled size
def load_data(file_path, num_samples):
    df = pd.read_csv(file_path)
    df = df.sample(n=num_samples, random_state=42).reset_index(drop=True)  # Shuffle and take subset
    df["target"] = df["pros"] + ". " + df["cons"]
    return Dataset.from_pandas(df)

# Load dataset paths
train_path = "pegasus_train_data.csv"
valid_path = "pegasus_valid_data.csv"

# Load specific number of samples
train_dataset = load_data(train_path, 10000)  # Take 10,000 training samples
valid_dataset = load_data(valid_path, 2500)   # Take 2,500 validation samples

# Load Pegasus tokenizer
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["reviews"], padding="max_length", truncation=True, max_length=512, return_tensors="pt"
    )
    labels = tokenizer(
        examples["target"], padding="max_length", truncation=True, max_length=256, return_tensors="pt"
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs

# Tokenizing datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

# Save tokenized datasets to disk
train_dataset.save_to_disk("tokenized_train_dataset")
valid_dataset.save_to_disk("tokenized_valid_dataset")

# Print first sample to verify
print(train_dataset[0])


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

{'reviews': 'If you have to deal with numbers, you need this pad for your laptop. Simple to use, just plug it into your usb port and go. I take the time to review purchases because I know that I rely on them myself and I like to try to help others make informed decisions. If my review was helpful to you, please click the "Helpful" button. Descent external 9 key keyboard, however it does not have a "delete" button (don\'t get fooled by the "Back Space" button) which is why I really wanted the external keypad. It is a little thicker than expected. I returned it because it didn\'t have a delete button. This keypad replaced a Belkin wireless keypad, which required a battery. I found that the battery was usually "dead" when I wanted to use it because it would turn on when a key was pressed. This was handy when preparing to use it, but a problem when traveling with it in a suitcase! The Satechi keypad plugs into my computer, so there\'ll be no problem with dead batteries. I\'ve only used it 

In [11]:
train_dataset.shape
valid_dataset.shape

(2500, 8)

In [5]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is detected
print(torch.cuda.device_count())  # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the GPU
print(torch.cuda.memory_allocated() / 1024**3, "GB Allocated")  # Memory used
print(torch.cuda.memory_reserved() / 1024**3, "GB Reserved")  # Memory reserved


True
2
Tesla T4
0.0 GB Allocated
0.0 GB Reserved


In [13]:
import torch
torch.cuda.empty_cache()


In [14]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [None]:
from datasets import load_from_disk
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Load tokenized datasets from disk
train_dataset = load_from_disk("tokenized_train_dataset")
valid_dataset = load_from_disk("tokenized_valid_dataset")

# Load model and tokenizer
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# Define data collator for padding dynamically
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./pegasus_finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train and evaluate
trainer.train()

# Save final model
trainer.save_model("pegasus_finetuned")
tokenizer.save_pretrained("pegasus_finetuned")

print("Training & Validation complete! Model saved.")

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 5000/5000 [1:26:39<00:00,  1.04s/it, loss=0.797]


Epoch 1 Train Loss: 1.2051


Validating: 100%|██████████| 1250/1250 [06:32<00:00,  3.18it/s]


Epoch 1 Validation Loss: 0.8189


Epoch 2: 100%|██████████| 5000/5000 [1:26:51<00:00,  1.04s/it, loss=0.761]


Epoch 2 Train Loss: 0.8197


Validating: 100%|██████████| 1250/1250 [06:35<00:00,  3.16it/s]


Epoch 2 Validation Loss: 0.7989


Epoch 3: 100%|██████████| 5000/5000 [1:26:53<00:00,  1.04s/it, loss=0.949]


Epoch 3 Train Loss: 0.7884


Validating:  59%|█████▊    | 734/1250 [03:52<02:43,  3.16it/s]

In [7]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch

# Load trained model and tokenizer
model_name = "/kaggle/working/pegasus_finetuned"  # Path where you saved the model
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor

In [17]:
import pandas as pd
from datasets import Dataset

# Load test dataset CSV
test_path = "pegasus_test_data.csv"  # Replace with your actual test file
df_test = pd.read_csv(test_path)

# Select 500 random samples for testing
df_test = df_test.sample(n=500, random_state=42).reset_index(drop=True)

# Convert to Hugging Face dataset format
test_dataset = Dataset.from_pandas(df_test)


In [18]:
test_dataset.shape

(500, 4)

In [19]:
def generate_summary(review_text):
    inputs = tokenizer(review_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move to GPU/CPU
    summary_ids = model.generate(**inputs, max_length=256, num_beams=5, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [20]:
# Generate summaries for all test reviews
df_test["generated_summary"] = df_test["reviews"].apply(generate_summary)

# Save results
df_test.to_csv("pegasus_test_results.csv", index=False)

# Display a few examples
print(df_test[["reviews", "generated_summary"]].head())


                                             reviews  \
0  Planned to try use it at the small gym locker ...   
1  I'm not an expert on olive oil, and I wasn't h...   
2  We went on a family trip to the zoo with our 3...   
3  I originally purchased this cleanser from Seph...   
4  Bought this for my FIL to replace his that flo...   

                                   generated_summary  
0  Easy to set up and use, Works well as a luggag...  
1  This oil is made from 100% Spanish olive oil, ...  
2  Made from 100% polyester, This towel is lightw...  
3  Formulated to be gentle on sensitive skin, Gen...  
4  This mattress is made with a combination of me...  


In [21]:
print(df_test['summary'][0])
print(df_test['pros'][0])
print(df_test['cons'][0])
print(df_test['generated_summary'][0])

These locks are a little larger than others and hold up better.
Well-made locks, Work well on zippers and small lockers, Company is responsive if you have problems with the locks, Locks come in orange or black
Setting the combination can be a little difficult, Some people have a hard time reopening their locks
Easy to set up and use, Works well as a luggage lock, but can also be used as a door lock, Easy to open and close, Works well as a door lock, but can also be used as a luggage lock, Works well as a door lock, but can also be used as a door lock, Works well as a luggage lock, Works well as a door lock, but can also be used as a door lock, Works well as a luggage lock, Works well as a door lock, but can also be used as a door lock, Works well as a luggage lock, Works well as a door lock, but can also be used as a door lock, Works well as a luggage lock, Works well as a door lock, Works well as a luggage lock, Works well as a door lock, Works well as a door lock, Works well as a doo

In [22]:
pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=5a9535675fec38475e6fcd888b0e8bfbbb521017e71914a743565c748df5d0e8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [23]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def compute_rouge_scores(target, prediction):
    scores = scorer.score(target, prediction)
    return scores

# Evaluate ROUGE on the test set
rouge_scores = df_test.apply(lambda row: compute_rouge_scores(row["summary"], row["generated_summary"]), axis=1)

# Extract average ROUGE scores
rouge_1 = sum([score["rouge1"].fmeasure for score in rouge_scores]) / len(rouge_scores)
rouge_2 = sum([score["rouge2"].fmeasure for score in rouge_scores]) / len(rouge_scores)
rouge_L = sum([score["rougeL"].fmeasure for score in rouge_scores]) / len(rouge_scores)

print(f"ROUGE-1: {rouge_1:.4f}, ROUGE-2: {rouge_2:.4f}, ROUGE-L: {rouge_L:.4f}")


ROUGE-1: 0.1580, ROUGE-2: 0.0199, ROUGE-L: 0.1153


In [15]:
mooc_df = pd.read_csv("/kaggle/input/mooc-dataset/mooc_coursera_dataset.csv")

In [16]:
mooc_df.shape

(45096, 6)

In [17]:
mooc_df = mooc_df.drop_duplicates()

In [18]:
mooc_df = mooc_df.dropna(subset=["reviews"])

In [19]:
mooc_df.shape

(17417, 6)

In [20]:
course_reviews_dict = mooc_df.groupby("course_title")["reviews"].apply(list).to_dict()
for key,value in course_reviews_dict.items():
  print(key,len(value))

Agile with Atlassian Jira 807
Become a CBRS Certified Professional Installer by Google 51
Building Scalable Java Microservices with Spring Boot and Spring Cloud 206
Business Metrics for Data-Driven Companies 273
Data Analysis with Python 1474
Data Science Methodology 1182
Databases and SQL for Data Science 587
Fundamentals of Project Planning and Management 1160
Google Cloud Platform Big Data and Machine Learning Fundamentals 397
Introduction to Data Science in Python 5052
Introduction to User Experience Design 889
Natural Language Processing in TensorFlow 707
Operating Systems and You: Becoming a Power User 2631
Programming Foundations with JavaScript, HTML and CSS 573
Site Reliability Engineering: Measuring and Managing Reliability 163
Supply Chain Principles 351
The Social Context of Mental Health and Illness 92
Visual Elements of User Interface Design 822


In [None]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import string

# Download English word list if not already present
nltk.download('punkt')
nltk.download('words')
english_words = set(words.words())

def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(str(text), "html.parser").get_text()

    # Normalize multiple punctuation marks (e.g., !!! becomes !)
    text = re.sub(r'([!?.]){2,}', r'\1', text)

    # Lowercase the text
    text = text.lower()

    # Remove URLs and hyperlinks
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation (except spaces)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Keep only valid English words
    tokens = [word for word in tokens if word in english_words]

    # Re-join tokens and remove extra spaces
    cleaned_text = ' '.join(tokens)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text


In [None]:
for key, values in course_reviews_dict.items():
    cleaned_reviews = [clean_text(review) for review in values]
    course_reviews_dict[key] = cleaned_reviews

In [21]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load Pegasus model and tokenizer
model_path = "/kaggle/working/pegasus_finetuned"  # Your fine-tuned model directory
tokenizer = PegasusTokenizer.from_pretrained(model_path)
model = PegasusForConditionalGeneration.from_pretrained(model_path)

# Move model to GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move to GPU/CPU
    summary_ids = model.generate(
    **inputs,
    max_length=256,  # Limit summary length
    num_beams=8,  # Beam search for better summaries
    early_stopping=True,  # Stop once summary makes sense
    no_repeat_ngram_size=3,  # Prevent repeating phrases
    length_penalty=1.5,  # Encourage concise summaries
    repetition_penalty=1.5  # Reduce excessive repetitions
)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summaries for all courses
summary_dict = {}
for key,values in course_reviews_dict.items():
    course_name = key
    original_text = course_reviews_dict[course_name]
    original_text = " ".join(original_text)
    gen_summ = generate_summary(original_text)
    summary_dict[course_name] = [gen_summ]
    print(course_name, "cmplt")

# Save the results
#summary_df = pd.DataFrame(list(course_summaries.items()), columns=["course_id", "generated_summary"])
#summary_df.to_csv("mooc_summaries.csv", index=False)

# Display some results



Agile with Atlassian Jira cmplt
Become a CBRS Certified Professional Installer by Google cmplt
Building Scalable Java Microservices with Spring Boot and Spring Cloud cmplt
Business Metrics for Data-Driven Companies cmplt
Data Analysis with Python cmplt
Data Science Methodology cmplt
Databases and SQL for Data Science cmplt
Fundamentals of Project Planning and Management cmplt
Google Cloud Platform Big Data and Machine Learning Fundamentals cmplt
Introduction to Data Science in Python cmplt
Introduction to User Experience Design cmplt
Natural Language Processing in TensorFlow cmplt
Operating Systems and You: Becoming a Power User cmplt
Programming Foundations with JavaScript, HTML and CSS cmplt
Site Reliability Engineering: Measuring and Managing Reliability cmplt
Supply Chain Principles cmplt
The Social Context of Mental Health and Illness cmplt
Visual Elements of User Interface Design cmplt


In [22]:
for key,values in summary_dict.items():
    print(key,":")
    print(values, "\n")

Agile with Atlassian Jira :
["This course is designed for those who want to learn more about how to use Jira, but don't want to spend a lot of time learning the ins and outs of the software. You'll need to pay for this course if you want to earn a certificate"] 

Become a CBRS Certified Professional Installer by Google :
["This course is designed to help you prepare for the Certified Professional in Internet Security (CPI) exam, It's easy to follow and covers all of the information you need to know to pass the exam. Some of the questions on the final exam were not included in the course"] 

Building Scalable Java Microservices with Spring Boot and Spring Cloud :
["This course is designed for those who want to learn more about Google's cloud platform and how it works, The course covers the basics of Google Cloud Platform (GCP), as well as how to use it in your day-to-day work. The course doesn't provide much in the way of real-world examples, and some of the labs don't work as expected"

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_semantic_similarity(original_text, generated_summary):
    """
    Evaluates semantic similarity between the original text and the generated summary.

    Args:
    - original_text (str): The original input text.
    - generated_summary (str): The generated summary.

    Returns:
    - float: Semantic similarity score (range 0 to 1).
    """
    # Load pre-trained sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for original text and summary
    embeddings = model.encode([original_text, generated_summary])

    # Compute cosine similarity
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return similarity_score

# Example Usage

total_score = 0
for key,values in course_reviews_dict.items():
    
    original_text = " ".join(course_reviews_dict[key])
    generated_summary = " ".join(summary_dict[key])

# Evaluate semantic similarity
    similarity_score = evaluate_semantic_similarity(original_text, generated_summary)
    total_score += similarity_score
    print(f"Semantic Similarity Score of :{key} is {similarity_score:.2f}")
    
avg_score = total_score/len(course_reviews_dict)
print(f"Average Semantic Similarity Score: {avg_score:.2f}")


In [None]:
total_ratio_score = 0
for key,values in course_reviews_dict.items():
    original_text_len = " ".join(course_reviews_dict[key])
    summary_len = " ".join(summary_dict[key]) 
    print(key)
    print('Length of the Original Text:', len(original_text_len))

    print("Length of the Summarized Text:", len(summary_len))
    ratio = (100*(len(summary_len))/(len(original_text_len)))
    print("Summary Ratio:", ratio, "%")
    total_ratio_score += ratio
avg = total_ratio_score/len(course_reviews_dict)
print(avg, "%")    

In [8]:
model.save_pretrained("pegasus_finetuned")
tokenizer.save_pretrained("pegasus_finetuned")

('pegasus_finetuned/tokenizer_config.json',
 'pegasus_finetuned/special_tokens_map.json',
 'pegasus_finetuned/spiece.model',
 'pegasus_finetuned/added_tokens.json')

In [None]:
import shutil

# Zip the model directory
shutil.make_archive("pegasus_finetuned", 'zip', "pegasus_finetuned")

In [None]:
from IPython.display import FileLink

FileLink(r'pegasus_finetuned.zip')

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def evaluate_coherence(text):
    """
    Evaluates coherence by computing cosine similarity between consecutive sentence embeddings.

    Args:
    - text (str): The input text to evaluate.

    Returns:
    - float: Average coherence score (range 0 to 1).
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentences = text.split(". ")  # Simple sentence segmentation
    embeddings = model.encode(sentences)
    
    if len(embeddings) < 2:
        return 1.0  # If only one sentence, coherence is perfect
    
    similarities = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0] 
                    for i in range(len(embeddings) - 1)]
    
    return np.mean(similarities)

# Example Usage
total_coherence = 0
for key, values in summary_dict.items():
    generated_summary = " ".join(values)
    coherence_score = evaluate_coherence(generated_summary)
    total_coherence += coherence_score
    print(f"Coherence Score for {key}: {coherence_score:.2f}")

avg_coherence = total_coherence / len(summary_dict)
print(f"Average Coherence Score: {avg_coherence:.2f}")
