In [1]:
import zipfile
import json
import io
import os

In [2]:
def read_json_from_folder(folder_path):
    data = {}
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

    for json_file in json_files:
        file_path = os.path.join(folder_path, json_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data[json_file] = json.load(f)
    
    return data


In [3]:
dataset_path = "/kaggle/input/amasum/min_10_max_100_revs_filt_complete"

train_path = os.path.join(dataset_path, "train")
test_path = os.path.join(dataset_path, "test")
valid_path = os.path.join(dataset_path, "valid")


In [4]:
train_data = read_json_from_folder(train_path)
test_data = read_json_from_folder(test_path)
valid_data = read_json_from_folder(valid_path)

# Print a sample JSON file
sample_file = list(train_data.keys())[0]
print(f"Sample file: {sample_file}\nData:\n", train_data[sample_file])

Sample file: B00LBFFSNM.json
Data:
 {'website_summaries': [{'verdict': "With its fun design and cheap price, the Lumia 635 is a decent option to consider if you're after 4G LTE on a budget. It's only marginally more expensive than its near-identical 3G-only Lumia 630 sibling, so you should certainly opt for the 635. If you want a better selection of apps however, the Motorola Moto G is still the 4G phone to go for.", 'pros': ['The Nokia Lumia 635 has an affordable price, its interchangeable cases are colourful and attractive and it has 4G LTE'], 'cons': ["Adding 4G LTE has meant the battery life has taken a hit, its screen resolution is unimpressive and its camera isn't really up to anything more than the odd Instagram snap"], 'aspects': {'Design': 7.0, 'Features': 7.0, 'Performance': 5.0}, 'publication_date': 20140807, 'rating': 7.5, 'source': 'cnet'}], 'customer_reviews': [{'title': 'this is the absolute best phone for the money in my opinion', 'text': 'As long as this is still avail

In [6]:
import os
import json
import pandas as pd
from tqdm import tqdm

def prepare_dataset(folder_path):
    """Reads JSON files and prepares data for summarization, including pros and cons."""
    data_list = []
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

    for json_file in tqdm(json_files, desc=f"Processing {folder_path}"):
        file_path = os.path.join(folder_path, json_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

            # Extract all customer reviews
            reviews = []
            if "customer_reviews" in data:
                for review in data["customer_reviews"]:
                    reviews.append(review.get("text", ""))

            # Extract summary, pros, and cons
            if "website_summaries" in data:
                summary_data = data["website_summaries"][0]  # First summary
                summary = summary_data.get("verdict", "")
                pros = ", ".join(summary_data.get("pros", []))  # Convert list to string
                cons = ", ".join(summary_data.get("cons", []))  # Convert list to string

                if reviews and summary:
                    combined_reviews = " ".join(reviews)  # Merge all reviews
                    data_list.append({
                        "reviews": combined_reviews,
                        "pros": pros,
                        "cons": cons,
                        "summary": summary
                    })

    return pd.DataFrame(data_list)

# Define dataset paths
dataset_path = "/kaggle/input/amasum/min_10_max_100_revs_filt_complete"

# Prepare Train, Validation, and Test Data
train_df = prepare_dataset(os.path.join(dataset_path, "train"))
valid_df = prepare_dataset(os.path.join(dataset_path, "valid"))
test_df = prepare_dataset(os.path.join(dataset_path, "test"))

# Save to CSV for later use
train_df.to_csv("distilbart_train_data.csv", index=False)
valid_df.to_csv("distilbart_valid_data.csv", index=False)
test_df.to_csv("distilbart_test_data.csv", index=False)




Processing /kaggle/input/amasum/min_10_max_100_revs_filt_complete/train: 100%|██████████| 25203/25203 [00:24<00:00, 1030.31it/s]
Processing /kaggle/input/amasum/min_10_max_100_revs_filt_complete/valid: 100%|██████████| 3114/3114 [00:02<00:00, 1042.20it/s]
Processing /kaggle/input/amasum/min_10_max_100_revs_filt_complete/test: 100%|██████████| 3166/3166 [00:02<00:00, 1214.41it/s]


In [7]:
# Display sample
print("Train Data Sample:")
print(train_df['reviews'][0])
print("\nValidation Data Sample:")
print(valid_df['summary'][0])
print("\nTest Data Sample:")
print(test_df['pros'][0])
print(test_df['cons'][0])

Train Data Sample:
As long as this is still available, this is the absolute best phone for the money in my opinion. I researched seemingly forever for a low priced full-featured phone, and this fits the bill. The Windows O.S. is seamless and easy to customize and navigate. I just bought two more for my wife and daughter, who were always "borrowing" mine. I just have to chuckle when I see everyone swooning over the latest exhorbitantly priced Apple phones, and paying the equvalent of a car payment to do the same things this phone does. And my $30/ month plan is adequate for my business calling, email checking and file sending needs, which is all I need it for. Effective July 31 2017 Microsoft no longer supports software on this phone. It took two hours at At&t to get contacts from old phone to this one. Only buy if you have a way to transfer your contacts and are a very knowledgeable Windows user. Design is poor as edges of screen are widest part of phone and not Gorilla glass, so any d

In [8]:
import pandas as pd
a = pd.read_csv("distilbart_train_data.csv")


In [9]:
a.shape

(25203, 4)

In [8]:
import pandas as pd
from datasets import Dataset
from transformers import BartTokenizer
from tqdm import tqdm

# Load CSV data with controlled size
def load_data(file_path, num_samples):
    df = pd.read_csv(file_path)
    df = df.sample(n=num_samples, random_state=42).reset_index(drop=True)  # Shuffle and take subset
    df["target"] = df["pros"] + ". " + df["cons"]
    return Dataset.from_pandas(df)

# Load dataset paths
train_path = "distilbart_train_data.csv"
valid_path = "distilbart_valid_data.csv"
test_path = "distilbart_test_data.csv"

# Load specific number of samples
train_dataset = load_data(train_path, 10000)  # Take 10,000 training samples
valid_dataset = load_data(valid_path, 2500)   # Take 2,500 validation samples
test_dataset = load_data(test_path, 1000)

# Load DistilBART tokenizer
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["reviews"], padding="max_length", truncation=True, max_length=512, return_tensors="pt"
    )
    labels = tokenizer(
        examples["target"], padding="max_length", truncation=True, max_length=256, return_tensors="pt"
    )["input_ids"]
    model_inputs["labels"] = labels
    return model_inputs

# Tokenizing datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
# Save tokenized datasets to disk
train_dataset.save_to_disk("tokenized_train_dataset")
valid_dataset.save_to_disk("tokenized_valid_dataset")
test_dataset.save_to_disk("tokenized_test_dataset")

# Print first sample to verify
print(train_dataset[0])


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

{'reviews': 'If you have to deal with numbers, you need this pad for your laptop. Simple to use, just plug it into your usb port and go. I take the time to review purchases because I know that I rely on them myself and I like to try to help others make informed decisions. If my review was helpful to you, please click the "Helpful" button. Descent external 9 key keyboard, however it does not have a "delete" button (don\'t get fooled by the "Back Space" button) which is why I really wanted the external keypad. It is a little thicker than expected. I returned it because it didn\'t have a delete button. This keypad replaced a Belkin wireless keypad, which required a battery. I found that the battery was usually "dead" when I wanted to use it because it would turn on when a key was pressed. This was handy when preparing to use it, but a problem when traveling with it in a suitcase! The Satechi keypad plugs into my computer, so there\'ll be no problem with dead batteries. I\'ve only used it 

In [11]:
from datasets import load_from_disk
test_dataset = load_from_disk("test_dataset")
print(test_dataset['target'][0])

FileNotFoundError: Directory test_dataset not found

In [39]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is detected
print(torch.cuda.device_count())  # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the GPU
print(torch.cuda.memory_allocated() / 1024**3, "GB Allocated")  # Memory used
print(torch.cuda.memory_reserved() / 1024**3, "GB Reserved")  # Memory reserved


True
2
Tesla T4
14.562282085418701 GB Allocated
14.595703125 GB Reserved


In [41]:
import torch
torch.cuda.empty_cache()


In [42]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [17]:
from datasets import load_from_disk
from torch.utils.data import DataLoader
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW, get_scheduler
from tqdm import tqdm

# Load tokenized datasets from disk
train_dataset = load_from_disk("tokenized_train_dataset")
valid_dataset = load_from_disk("tokenized_valid_dataset")

# Convert datasets to PyTorch format
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Convert to PyTorch DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=8)

# Load DistilBART model and tokenizer
model_name = "sshleifer/distilbart-cnn-12-6"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * 3  # Adjusted for 5 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3  # Set to 5 epochs
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    loop = tqdm(train_dataloader, leave=True)

    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}  # Move tensors to GPU/CPU
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}/{num_epochs}")
        loop.set_postfix(loss=loss.item())

    avg_train_loss = train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f"Validating Epoch {epoch+1}/{num_epochs}"):
            batch = {k: v.to(device) for k, v in batch.items()}  # Move to CPU
            outputs = model(**batch)
            valid_loss += outputs.loss.item()

    avg_valid_loss = valid_loss / len(valid_dataloader)
    print(f"Epoch {epoch+1} Validation Loss: {avg_valid_loss:.4f}")

# Save the trained model and tokenizer
model.save_pretrained("distilbart_finetuned")
tokenizer.save_pretrained("distilbart_finetuned")

print("Training & Validation complete! Model saved.")


Epoch 1/3: 100%|██████████| 1250/1250 [41:15<00:00,  1.98s/it, loss=0.825]


Epoch 1 Train Loss: 1.0517


Validating Epoch 1/3: 100%|██████████| 313/313 [03:16<00:00,  1.59it/s]


Epoch 1 Validation Loss: 0.8869


Epoch 2/3: 100%|██████████| 1250/1250 [41:16<00:00,  1.98s/it, loss=0.755]


Epoch 2 Train Loss: 0.8284


Validating Epoch 2/3: 100%|██████████| 313/313 [03:16<00:00,  1.59it/s]


Epoch 2 Validation Loss: 0.8575


Epoch 3/3: 100%|██████████| 1250/1250 [41:14<00:00,  1.98s/it, loss=0.674]


Epoch 3 Train Loss: 0.7402


Validating Epoch 3/3: 100%|██████████| 313/313 [03:17<00:00,  1.59it/s]


Epoch 3 Validation Loss: 0.8554
Training & Validation complete! Model saved.


In [18]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Load trained model and tokenizer
model_name = "/kaggle/working/distilbart_finetuned"  # Path where you saved the model
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model and tokenizer loaded successfully!")




Model and tokenizer loaded successfully!


In [19]:
print(test_dataset['target'][0])

Well-made locks, Work well on zippers and small lockers, Company is responsive if you have problems with the locks, Locks come in orange or black. Setting the combination can be a little difficult, Some people have a hard time reopening their locks


In [20]:
pip install rouge-score

Note: you may need to restart the kernel to use updated packages.


In [21]:
from datasets import load_from_disk
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
from tqdm import tqdm
from rouge_score import rouge_scorer
import pandas as pd

# Load fine-tuned model and tokenizer
model_name = "/kaggle/working/distilbart_finetuned"  # Update path if needed
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load tokenized test dataset
test_dataset = load_from_disk("tokenized_test_dataset")

# Convert dataset to PyTorch format
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Function to generate summaries
def generate_summary(batch, max_length=150, num_beams=5):
    inputs = {k: v.to(device) for k, v in batch.items()}  # Move tensors to device
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # Ensure input tensor shape is valid
    if len(input_ids.shape) == 1:
        input_ids = input_ids.unsqueeze(0)
        attention_mask = attention_mask.unsqueeze(0)
    
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=2.0,
        early_stopping=True
    )
    
    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Run inference & evaluation
generated_summaries = []
reference_summaries = []
rouge_scores = []

for i in tqdm(range(len(test_dataset)), desc="Testing"):
    sample = test_dataset[i]  # Get each test sample
    generated_summary = generate_summary(sample)[0]  # Generate summary
    reference_summary = tokenizer.decode(sample["labels"], skip_special_tokens=True)  # Ground truth

    generated_summaries.append(generated_summary)
    reference_summaries.append(reference_summary)

    # Compute ROUGE scores
    scores = scorer.score(reference_summary, generated_summary)
    rouge_scores.append({
        "rouge1": scores["rouge1"].fmeasure,
        "rouge2": scores["rouge2"].fmeasure,
        "rougeL": scores["rougeL"].fmeasure,
    })

# Convert scores to DataFrame for analysis
df_scores = pd.DataFrame(rouge_scores)

# Print average ROUGE scores
print("\n🔍 **Average ROUGE Scores:**")
print(f"ROUGE-1: {df_scores['rouge1'].mean():.4f}")
print(f"ROUGE-2: {df_scores['rouge2'].mean():.4f}")
print(f"ROUGE-L: {df_scores['rougeL'].mean():.4f}")

# Save generated summaries & scores
output_file = "generated_summaries.txt"
scores_file = "rouge_scores.csv"

with open(output_file, "w") as f:
    for summary in generated_summaries:
        f.write(summary + "\n")

df_scores.to_csv(scores_file, index=False)

print(f"\n✅ Summaries saved to {output_file}")
print(f"✅ ROUGE scores saved to {scores_file}")


Testing: 100%|██████████| 1000/1000 [21:21<00:00,  1.28s/it]


🔍 **Average ROUGE Scores:**
ROUGE-1: 0.0110
ROUGE-2: 0.0020
ROUGE-L: 0.0082

✅ Summaries saved to generated_summaries.txt
✅ ROUGE scores saved to rouge_scores.csv





In [22]:
print(test_dataset['target'][0])
print(test_dataset['generated_summary'][0])

Well-made locks, Work well on zippers and small lockers, Company is responsive if you have problems with the locks, Locks come in orange or black. Setting the combination can be a little difficult, Some people have a hard time reopening their locks


KeyError: "Column generated_summary not in the dataset. Current columns in the dataset: ['reviews', 'pros', 'cons', 'summary', 'target', 'input_ids', 'attention_mask', 'labels']"

In [23]:
mooc_df = pd.read_csv("/kaggle/input/mooc-dataset/mooc_coursera_dataset.csv")

In [24]:
mooc_df.shape

(45096, 6)

In [25]:
mooc_df = mooc_df.drop_duplicates()

In [26]:
mooc_df = mooc_df.dropna(subset=["reviews"])

In [27]:
mooc_df.shape

(17417, 6)

In [28]:
course_reviews_dict = mooc_df.groupby("course_title")["reviews"].apply(list).to_dict()
for key,value in course_reviews_dict.items():
  print(key,len(value))

Agile with Atlassian Jira 807
Become a CBRS Certified Professional Installer by Google 51
Building Scalable Java Microservices with Spring Boot and Spring Cloud 206
Business Metrics for Data-Driven Companies 273
Data Analysis with Python 1474
Data Science Methodology 1182
Databases and SQL for Data Science 587
Fundamentals of Project Planning and Management 1160
Google Cloud Platform Big Data and Machine Learning Fundamentals 397
Introduction to Data Science in Python 5052
Introduction to User Experience Design 889
Natural Language Processing in TensorFlow 707
Operating Systems and You: Becoming a Power User 2631
Programming Foundations with JavaScript, HTML and CSS 573
Site Reliability Engineering: Measuring and Managing Reliability 163
Supply Chain Principles 351
The Social Context of Mental Health and Illness 92
Visual Elements of User Interface Design 822


In [54]:
print(course_reviews_dict['Become a CBRS Certified Professional Installer by Google'])

["Pretty dry, but I was able to pass with just two complete watches so I'm happy about that. As usual there were some questions on the final exam that were NO WHERE in the course, which is annoying but far better than many microsoft tests I have taken. Never found the suplimental material that the course references... but who cares... i passed!", 'would be a better experience if the video and screen shots would sho on the side of the text that the instructor is going thru so that user does not have to go all the way to beginning of text to be able to view any slides instructor is showing.', 'Information was perfect! The program itself was a little annoying. I had to wait 30 to 45 minutes after watching the videos to to take the quiz. Other than that the information was perfect and passed the test with no issues!', 'A few grammatical mistakes on test made me do a double take but all in all not bad.', 'Excellent course and the training provided was very detailed and easy to follow.', 'So

In [None]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import string

# Download English word list if not already present
nltk.download('punkt')
nltk.download('words')
english_words = set(words.words())

def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(str(text), "html.parser").get_text()

    # Normalize multiple punctuation marks (e.g., !!! becomes !)
    text = re.sub(r'([!?.]){2,}', r'\1', text)

    # Lowercase the text
    text = text.lower()

    # Remove URLs and hyperlinks
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation (except spaces)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Keep only valid English words
    tokens = [word for word in tokens if word in english_words]

    # Re-join tokens and remove extra spaces
    cleaned_text = ' '.join(tokens)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text


In [None]:
for key, values in course_reviews_dict.items():
    cleaned_reviews = [clean_text(review) for review in values]
    course_reviews_dict[key] = cleaned_reviews

In [32]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load Pegasus model and tokenizer
model_path = "/kaggle/working/distilbart_finetuned"  # Your fine-tuned model directory
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


# Move model to GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move to GPU/CPU
    summary_ids = model.generate(
    **inputs,
    max_length=256,  # Limit summary length
    num_beams=8,  # Beam search for better summaries
    early_stopping=True,  # Stop once summary makes sense
    no_repeat_ngram_size=3,  # Prevent repeating phrases
    length_penalty=2,  # Encourage concise summaries
    repetition_penalty=1.5  # Reduce excessive repetitions
)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summaries for all courses
summary_dict = {}
for key,values in course_reviews_dict.items():
    course_name = key
    original_text = course_reviews_dict[course_name]
    original_text = " ".join(original_text)
    gen_summ = generate_summary(original_text)
    summary_dict[course_name] = [gen_summ]
    print(course_name, "cmplt")

# Save the results
#summary_df = pd.DataFrame(list(course_summaries.items()), columns=["course_id", "generated_summary"])
#summary_df.to_csv("mooc_summaries.csv", index=False)

# Display some results





Agile with Atlassian Jira cmplt
Become a CBRS Certified Professional Installer by Google cmplt
Building Scalable Java Microservices with Spring Boot and Spring Cloud cmplt
Business Metrics for Data-Driven Companies cmplt
Data Analysis with Python cmplt
Data Science Methodology cmplt
Databases and SQL for Data Science cmplt
Fundamentals of Project Planning and Management cmplt
Google Cloud Platform Big Data and Machine Learning Fundamentals cmplt
Introduction to Data Science in Python cmplt
Introduction to User Experience Design cmplt
Natural Language Processing in TensorFlow cmplt
Operating Systems and You: Becoming a Power User cmplt
Programming Foundations with JavaScript, HTML and CSS cmplt
Site Reliability Engineering: Measuring and Managing Reliability cmplt
Supply Chain Principles cmplt
The Social Context of Mental Health and Illness cmplt
Visual Elements of User Interface Design cmplt


In [33]:
for key,values in summary_dict.items():
    print(key,":")
    print(values, "\n")

Agile with Atlassian Jira :
['A comprehensive overview of how to use Jira on a very basic level, Coursera’s free JIRA-based course is well-organized and easy to follow, A good value for the money, as you get a free online certificate. Some of the content isn’t worth the cost, and some of the topics are fairly basic, The videos are short, so you’ll need to listen to them at 1,5x speed for it to be at a good pace'] 

Become a CBRS Certified Professional Installer by Google :
["Provides a comprehensive overview of what you need to pass the CPI exam, Coursera's free online tutorials are well-organized and easy to follow, The program itself is well-designed and user-friendly. Some of the questions on the final exam were not explicitly covered in the course, which is annoying"] 

Building Scalable Java Microservices with Spring Boot and Spring Cloud :
["The Google Cloud Quicklabs Microservices Microscores Microservices and Spring Boot tutorials are useful for upskilling software developers w

In [34]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_semantic_similarity(original_text, generated_summary):
    """
    Evaluates semantic similarity between the original text and the generated summary.

    Args:
    - original_text (str): The original input text.
    - generated_summary (str): The generated summary.

    Returns:
    - float: Semantic similarity score (range 0 to 1).
    """
    # Load pre-trained sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for original text and summary
    embeddings = model.encode([original_text, generated_summary])

    # Compute cosine similarity
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return similarity_score

# Example Usage

total_score = 0
for key,values in course_reviews_dict.items():
    
    original_text = " ".join(course_reviews_dict[key])
    generated_summary = " ".join(summary_dict[key])

# Evaluate semantic similarity
    similarity_score = evaluate_semantic_similarity(original_text, generated_summary)
    total_score += similarity_score
    print(f"Semantic Similarity Score of :{key} is {similarity_score:.2f}")
    
avg_score = total_score/len(course_reviews_dict)
print(f"Average Semantic Similarity Score: {avg_score:.2f}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Agile with Atlassian Jira is 0.82


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Become a CBRS Certified Professional Installer by Google is 0.41


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Building Scalable Java Microservices with Spring Boot and Spring Cloud is 0.65


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Business Metrics for Data-Driven Companies is 0.64


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Data Analysis with Python is 0.52


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Data Science Methodology is 0.39


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Databases and SQL for Data Science is 0.47


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Fundamentals of Project Planning and Management is 0.71


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Google Cloud Platform Big Data and Machine Learning Fundamentals is 0.64


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Introduction to Data Science in Python is 0.52


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Introduction to User Experience Design is 0.46


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Natural Language Processing in TensorFlow is 0.57


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Operating Systems and You: Becoming a Power User is 0.55


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Programming Foundations with JavaScript, HTML and CSS is 0.63


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Site Reliability Engineering: Measuring and Managing Reliability is 0.52


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Supply Chain Principles is 0.48


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :The Social Context of Mental Health and Illness is 0.67


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Visual Elements of User Interface Design is 0.36
Average Semantic Similarity Score: 0.56


In [40]:
total_or = 0
total_sum = 0
for key,values in course_reviews_dict.items():
    original_text_len = " ".join(course_reviews_dict[key])
    summary_len = " ".join(summary_dict[key]) 
    print(key)
    total_or += len(original_text_len)
    total_sum += len(summary_len) 
    print('Length of the Original Text:', len(original_text_len))

    print("Length of the Summarized Text:", len(summary_len))
    print("Summary Ratio:", 100*(len(summary_len)/len(original_text_len)), "%")
avg = 100*(total_sum/total_or)
print("average:", avg,"%")

Agile with Atlassian Jira
Length of the Original Text: 98084
Length of the Summarized Text: 384
Summary Ratio: 0.3915011622690755 %
Become a CBRS Certified Professional Installer by Google
Length of the Original Text: 10270
Length of the Summarized Text: 300
Summary Ratio: 2.9211295034079843 %
Building Scalable Java Microservices with Spring Boot and Spring Cloud
Length of the Original Text: 22370
Length of the Summarized Text: 515
Summary Ratio: 2.3021904336164507 %
Business Metrics for Data-Driven Companies
Length of the Original Text: 63162
Length of the Summarized Text: 383
Summary Ratio: 0.6063772521452772 %
Data Analysis with Python
Length of the Original Text: 525980
Length of the Summarized Text: 455
Summary Ratio: 0.08650519031141869 %
Data Science Methodology
Length of the Original Text: 200617
Length of the Summarized Text: 473
Summary Ratio: 0.23577264140127707 %
Databases and SQL for Data Science
Length of the Original Text: 118992
Length of the Summarized Text: 407
Summar

In [41]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def evaluate_coherence(text):
    """
    Evaluates coherence by computing cosine similarity between consecutive sentence embeddings.

    Args:
    - text (str): The input text to evaluate.

    Returns:
    - float: Average coherence score (range 0 to 1).
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentences = text.split(". ")  # Simple sentence segmentation
    embeddings = model.encode(sentences)
    
    if len(embeddings) < 2:
        return 1.0  # If only one sentence, coherence is perfect
    
    similarities = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0] 
                    for i in range(len(embeddings) - 1)]
    
    return np.mean(similarities)

# Example Usage
total_coherence = 0
for key, values in summary_dict.items():
    generated_summary = " ".join(values)
    coherence_score = evaluate_coherence(generated_summary)
    total_coherence += coherence_score
    print(f"Coherence Score for {key}: {coherence_score:.2f}")

avg_coherence = total_coherence / len(summary_dict)
print(f"Average Coherence Score: {avg_coherence:.2f}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Agile with Atlassian Jira: 0.32


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Become a CBRS Certified Professional Installer by Google: 0.33


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Building Scalable Java Microservices with Spring Boot and Spring Cloud: 0.50


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Business Metrics for Data-Driven Companies: 0.34


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Data Analysis with Python: 0.26


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Data Science Methodology: 0.16


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Databases and SQL for Data Science: 0.24


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Fundamentals of Project Planning and Management: 0.47


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Google Cloud Platform Big Data and Machine Learning Fundamentals: 0.29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Introduction to Data Science in Python: 0.26


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Introduction to User Experience Design: 0.27


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Natural Language Processing in TensorFlow: 0.43


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Operating Systems and You: Becoming a Power User: 0.38


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Programming Foundations with JavaScript, HTML and CSS: 0.23


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Site Reliability Engineering: Measuring and Managing Reliability: 0.54


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Supply Chain Principles: 0.37


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for The Social Context of Mental Health and Illness: 0.50


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Visual Elements of User Interface Design: 0.41
Average Coherence Score: 0.35


In [42]:
model.save_pretrained("distilbart_finetuned")
tokenizer.save_pretrained("distilbart_finetuned")



('distilbart_finetuned/tokenizer_config.json',
 'distilbart_finetuned/special_tokens_map.json',
 'distilbart_finetuned/vocab.json',
 'distilbart_finetuned/merges.txt',
 'distilbart_finetuned/added_tokens.json')

In [43]:
import shutil

# Zip the model directory
shutil.make_archive("distilbart_finetuned", 'zip', "distilbart_finetuned")

'/kaggle/working/distilbart_finetuned.zip'

In [44]:
from IPython.display import FileLink

FileLink(r'distilbart_finetuned.zip')
