In [2]:
import zipfile
import json
import io
import os



In [3]:
def read_json_from_folder(folder_path):
    data = {}
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

    for json_file in json_files:
        file_path = os.path.join(folder_path, json_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data[json_file] = json.load(f)
    
    return data


In [4]:
dataset_path = "/kaggle/input/amaa-sum/min_10_max_100_revs_filt_complete"

train_path = os.path.join(dataset_path, "train")
test_path = os.path.join(dataset_path, "test")
valid_path = os.path.join(dataset_path, "valid")


In [5]:
train_data = read_json_from_folder(train_path)
test_data = read_json_from_folder(test_path)
valid_data = read_json_from_folder(valid_path)

# Print a sample JSON file
sample_file = list(train_data.keys())[0]
print(f"Sample file: {sample_file}\nData:\n", train_data[sample_file])

Sample file: B00LBFFSNM.json
Data:
 {'website_summaries': [{'verdict': "With its fun design and cheap price, the Lumia 635 is a decent option to consider if you're after 4G LTE on a budget. It's only marginally more expensive than its near-identical 3G-only Lumia 630 sibling, so you should certainly opt for the 635. If you want a better selection of apps however, the Motorola Moto G is still the 4G phone to go for.", 'pros': ['The Nokia Lumia 635 has an affordable price, its interchangeable cases are colourful and attractive and it has 4G LTE'], 'cons': ["Adding 4G LTE has meant the battery life has taken a hit, its screen resolution is unimpressive and its camera isn't really up to anything more than the odd Instagram snap"], 'aspects': {'Design': 7.0, 'Features': 7.0, 'Performance': 5.0}, 'publication_date': 20140807, 'rating': 7.5, 'source': 'cnet'}], 'customer_reviews': [{'title': 'this is the absolute best phone for the money in my opinion', 'text': 'As long as this is still avail

In [6]:
import os
import json
import pandas as pd
from tqdm import tqdm

def prepare_dataset(folder_path):
    """Reads JSON files and prepares data for summarization, including pros and cons."""
    data_list = []
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

    for json_file in tqdm(json_files, desc=f"Processing {folder_path}"):
        file_path = os.path.join(folder_path, json_file)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

            # Extract all customer reviews
            reviews = []
            if "customer_reviews" in data:
                for review in data["customer_reviews"]:
                    reviews.append(review.get("text", ""))

            # Extract summary, pros, and cons
            if "website_summaries" in data:
                summary_data = data["website_summaries"][0]  # First summary
                summary = summary_data.get("verdict", "")
                pros = ", ".join(summary_data.get("pros", []))  # Convert list to string
                cons = ", ".join(summary_data.get("cons", []))  # Convert list to string

                if reviews and summary:
                    combined_reviews = " ".join(reviews)  # Merge all reviews
                    data_list.append({
                        "reviews": combined_reviews,
                        "pros": pros,
                        "cons": cons,
                        "summary": summary
                    })

    return pd.DataFrame(data_list)

# Define dataset paths
dataset_path = "/kaggle/input/amaa-sum/min_10_max_100_revs_filt_complete"

# Prepare Train, Validation, and Test Data
train_df = prepare_dataset(os.path.join(dataset_path, "train"))
valid_df = prepare_dataset(os.path.join(dataset_path, "valid"))
test_df = prepare_dataset(os.path.join(dataset_path, "test"))

# Save to CSV for later use
train_df.to_csv("bart_train_data.csv", index=False)
valid_df.to_csv("bart_valid_data.csv", index=False)
test_df.to_csv("bart_test_data.csv", index=False)




Processing /kaggle/input/amaa-sum/min_10_max_100_revs_filt_complete/train: 100%|██████████| 25203/25203 [00:44<00:00, 566.70it/s]
Processing /kaggle/input/amaa-sum/min_10_max_100_revs_filt_complete/valid: 100%|██████████| 3114/3114 [00:05<00:00, 612.41it/s]
Processing /kaggle/input/amaa-sum/min_10_max_100_revs_filt_complete/test: 100%|██████████| 3166/3166 [00:04<00:00, 686.17it/s]


In [15]:
# Display sample
print("Train Data Sample:")
print(train_df['reviews'][0])
print("\nValidation Data Sample:")
print(valid_df['summary'][0])
print("\nTest Data Sample:")
print(test_df['pros'][0])
print(test_df['cons'][0])

Train Data Sample:
As long as this is still available, this is the absolute best phone for the money in my opinion. I researched seemingly forever for a low priced full-featured phone, and this fits the bill. The Windows O.S. is seamless and easy to customize and navigate. I just bought two more for my wife and daughter, who were always "borrowing" mine. I just have to chuckle when I see everyone swooning over the latest exhorbitantly priced Apple phones, and paying the equvalent of a car payment to do the same things this phone does. And my $30/ month plan is adequate for my business calling, email checking and file sending needs, which is all I need it for. Effective July 31 2017 Microsoft no longer supports software on this phone. It took two hours at At&t to get contacts from old phone to this one. Only buy if you have a way to transfer your contacts and are a very knowledgeable Windows user. Design is poor as edges of screen are widest part of phone and not Gorilla glass, so any d

In [12]:
import pandas as pd
a = pd.read_csv("bart_train_data.csv")


In [13]:
a.shape

(25203, 4)

In [7]:
import pandas as pd
from datasets import Dataset
from transformers import BartTokenizer
from tqdm import tqdm

# Load CSV data with controlled size
def load_data(file_path, num_samples):
    df = pd.read_csv(file_path)
    df = df.sample(n=num_samples, random_state=42).reset_index(drop=True)  # Shuffle and take subset
    df["input_text"] = df["reviews"]  # BART does not need "summarize:" prefix
    df["target_text"] = df["pros"] + ". " + df["cons"]
    return Dataset.from_pandas(df)

# Load dataset paths
train_path = "bart_train_data.csv"
valid_path = "bart_valid_data.csv"
test_path = "bart_test_data.csv"

# Load specific number of samples
train_dataset = load_data(train_path, 10000)
valid_dataset = load_data(valid_path, 2500)
test_dataset = load_data(test_path, 1000)

# Load BART tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["input_text"], padding="max_length", truncation=True, max_length=512
    )
    labels = tokenizer(
        examples["target_text"], padding="max_length", truncation=True, max_length=256
    )
    model_inputs["labels"] = labels["input_ids"]  # Correcting label assignment
    return model_inputs

# Tokenizing datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Save tokenized datasets to disk
train_dataset.save_to_disk("tokenized_train_dataset")
valid_dataset.save_to_disk("tokenized_valid_dataset")
test_dataset.save_to_disk("tokenized_test_dataset")

# Print first sample to verify
print(train_dataset[0])


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

{'reviews': 'If you have to deal with numbers, you need this pad for your laptop. Simple to use, just plug it into your usb port and go. I take the time to review purchases because I know that I rely on them myself and I like to try to help others make informed decisions. If my review was helpful to you, please click the "Helpful" button. Descent external 9 key keyboard, however it does not have a "delete" button (don\'t get fooled by the "Back Space" button) which is why I really wanted the external keypad. It is a little thicker than expected. I returned it because it didn\'t have a delete button. This keypad replaced a Belkin wireless keypad, which required a battery. I found that the battery was usually "dead" when I wanted to use it because it would turn on when a key was pressed. This was handy when preparing to use it, but a problem when traveling with it in a suitcase! The Satechi keypad plugs into my computer, so there\'ll be no problem with dead batteries. I\'ve only used it 

In [19]:
from datasets import load_from_disk
test_dataset = load_from_disk("tokenized_train_dataset")
print(test_dataset['target_text'][1])

60-piece set includes 4 large squares, 24 small squares, 18 small triangles, 8 medium triangles and 6 large triangles, Compatible with other PicassoTiles sets. Tiles can come apart, spilling magnets on the floor, so keep away from babies and young toddlers


In [26]:
import torch
print(torch.cuda.is_available())  # Should return True if GPU is detected
print(torch.cuda.device_count())  # Number of GPUs available
print(torch.cuda.get_device_name(0))  # Name of the GPU
print(torch.cuda.memory_allocated() / 1024**3, "GB Allocated")  # Memory used
print(torch.cuda.memory_reserved() / 1024**3, "GB Reserved")  # Memory reserved


True
2
Tesla T4
2.941657543182373 GB Allocated
3.736328125 GB Reserved


In [24]:
import torch
torch.cuda.empty_cache()


In [25]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [20]:
pip install transformers datasets torch tqdm


Note: you may need to restart the kernel to use updated packages.


In [8]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_from_disk

# Load tokenized datasets
train_dataset = load_from_disk("tokenized_train_dataset")
valid_dataset = load_from_disk("tokenized_valid_dataset")

# Load model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bart-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # Keep only the best two models
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_steps=1000,
    eval_steps=1000,
    report_to="none",
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Train model
trainer.train()

# Save best model
trainer.save_model("./best_bart_model")
tokenizer.save_pretrained("./best_bart_model")


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,1.0953,0.808296
2,0.7539,0.794872


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./best_bart_model/tokenizer_config.json',
 './best_bart_model/special_tokens_map.json',
 './best_bart_model/vocab.json',
 './best_bart_model/merges.txt',
 './best_bart_model/added_tokens.json')

In [9]:

model.save_pretrained("best_bart_model")
tokenizer.save_pretrained("best_bart_model")


('best_bart_model/tokenizer_config.json',
 'best_bart_model/special_tokens_map.json',
 'best_bart_model/vocab.json',
 'best_bart_model/merges.txt',
 'best_bart_model/added_tokens.json')

In [10]:
import shutil

# Zip the model directory
shutil.make_archive("best_bart_model", 'zip', "best_bart_model")


'/kaggle/working/best_bart_model.zip'

In [11]:
from IPython.display import FileLink

FileLink(r'best_bart_model.zip')



In [12]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_from_disk

# Load fine-tuned BART model and tokenizer
model_path = "/kaggle/working/best_bart_model"  # Replace with the path to your fine-tuned BART model
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)

# Load tokenized test dataset
test_dataset = load_from_disk("tokenized_test_dataset")

# Convert to PyTorch format
# For BART, usually ["input_ids", "attention_mask", "labels"] are enough
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [13]:
print(test_dataset['target_text'][0])

Well-made locks, Work well on zippers and small lockers, Company is responsive if you have problems with the locks, Locks come in orange or black. Setting the combination can be a little difficult, Some people have a hard time reopening their locks


In [14]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=4cbac8f8922dcb2ae00e1e69f02380b6978bbee8d68c94dd01042d7726d28792
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [15]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [16]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
from tqdm import tqdm
from datasets import load_from_disk

# Load best fine-tuned model
model_path = "/kaggle/working/best_bart_model"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)
model.eval()

# Load tokenized test dataset
test_dataset = load_from_disk("tokenized_test_dataset")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Generate summaries
def generate_summary(sample, max_length=150, num_beams=5):
    inputs = {
        "input_ids": torch.tensor(sample["input_ids"]).unsqueeze(0).to(device),
        "attention_mask": torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device),
    }
    summary_ids = model.generate(**inputs, max_length=max_length, num_beams=num_beams)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate summaries for the test set
generated_summaries = []
reference_summaries = []

for i in tqdm(range(len(test_dataset)), desc="Testing"):
    sample = test_dataset[i]
    generated_summaries.append(generate_summary(sample))
    reference_summaries.append(tokenizer.decode(sample["labels"], skip_special_tokens=True))

# Save results
import pandas as pd

df = pd.DataFrame({"Generated Summary": generated_summaries, "Reference Summary": reference_summaries})
df.to_csv("test_results.csv", index=False)
print("Test results saved to test_results.csv")


Testing: 100%|██████████| 1000/1000 [27:25<00:00,  1.65s/it]

Test results saved to test_results.csv





In [17]:
from rouge_score import rouge_scorer
import numpy as np

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Store scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Evaluation loop
for i in tqdm(range(len(test_dataset)), desc="Evaluating ROUGE"):
    sample = test_dataset[i]
    
    # Generate summary
    generated_summary = generate_summary(sample)  
    reference_summary = tokenizer.decode(sample["labels"], skip_special_tokens=True)

    # Compute ROUGE scores
    scores = scorer.score(reference_summary, generated_summary)

    rouge1_scores.append(scores["rouge1"].fmeasure)
    rouge2_scores.append(scores["rouge2"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)

# Compute average scores
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)

# Print final results
print("\n💡 ROUGE Evaluation Results:")
print(f"ROUGE-1: {avg_rouge1:.4f}")
print(f"ROUGE-2: {avg_rouge2:.4f}")
print(f"ROUGE-L: {avg_rougeL:.4f}")


Evaluating ROUGE: 100%|██████████| 1000/1000 [27:33<00:00,  1.65s/it]


💡 ROUGE Evaluation Results:
ROUGE-1: 0.0680
ROUGE-2: 0.0113
ROUGE-L: 0.0463





In [18]:
mooc_df = pd.read_csv("/kaggle/input/moocc-dataset/mooc_coursera_dataset.csv")

In [19]:
mooc_df.shape

(45096, 6)

In [20]:
mooc_df = mooc_df.drop_duplicates()

In [21]:
mooc_df = mooc_df.dropna(subset=["reviews"])

In [22]:
mooc_df.shape

(17417, 6)

In [23]:
course_reviews_dict = mooc_df.groupby("course_title")["reviews"].apply(list).to_dict()
for key,value in course_reviews_dict.items():
  print(key,len(value))

Agile with Atlassian Jira 807
Become a CBRS Certified Professional Installer by Google 51
Building Scalable Java Microservices with Spring Boot and Spring Cloud 206
Business Metrics for Data-Driven Companies 273
Data Analysis with Python 1474
Data Science Methodology 1182
Databases and SQL for Data Science 587
Fundamentals of Project Planning and Management 1160
Google Cloud Platform Big Data and Machine Learning Fundamentals 397
Introduction to Data Science in Python 5052
Introduction to User Experience Design 889
Natural Language Processing in TensorFlow 707
Operating Systems and You: Becoming a Power User 2631
Programming Foundations with JavaScript, HTML and CSS 573
Site Reliability Engineering: Measuring and Managing Reliability 163
Supply Chain Principles 351
The Social Context of Mental Health and Illness 92
Visual Elements of User Interface Design 822


In [39]:
print(course_reviews_dict['Become a CBRS Certified Professional Installer by Google'])

["Pretty dry, but I was able to pass with just two complete watches so I'm happy about that. As usual there were some questions on the final exam that were NO WHERE in the course, which is annoying but far better than many microsoft tests I have taken. Never found the suplimental material that the course references... but who cares... i passed!", 'would be a better experience if the video and screen shots would sho on the side of the text that the instructor is going thru so that user does not have to go all the way to beginning of text to be able to view any slides instructor is showing.', 'Information was perfect! The program itself was a little annoying. I had to wait 30 to 45 minutes after watching the videos to to take the quiz. Other than that the information was perfect and passed the test with no issues!', 'A few grammatical mistakes on test made me do a double take but all in all not bad.', 'Excellent course and the training provided was very detailed and easy to follow.', 'So

In [28]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

# Load Pegasus model and tokenizer
model_path = "/kaggle/working/best_bart_model"  # Your fine-tuned model directory
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)
model.eval()


# Move model to GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move to GPU/CPU
    summary_ids = model.generate(
    **inputs,
    max_length=256,  # Limit summary length
    num_beams=5,  # Beam search for better summaries
    early_stopping=True,  # Stop once summary makes sense
    no_repeat_ngram_size=3,  # Prevent repeating phrases
    length_penalty=1.5,  # Encourage concise summaries
    repetition_penalty=2.0  # Reduce excessive repetitions
)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summaries for all courses
summary_dict = {}
for key,values in course_reviews_dict.items():
    course_name = key
    original_text = course_reviews_dict[course_name]
    original_text = " ".join(original_text)
    gen_summ = generate_summary(original_text)
    summary_dict[course_name] = [gen_summ]
    print(course_name, "cmplt")

# Save the results
#summary_df = pd.DataFrame(list(course_summaries.items()), columns=["course_id", "generated_summary"])
#summary_df.to_csv("mooc_summaries.csv", index=False)

# Display some results



Agile with Atlassian Jira cmplt
Become a CBRS Certified Professional Installer by Google cmplt
Building Scalable Java Microservices with Spring Boot and Spring Cloud cmplt
Business Metrics for Data-Driven Companies cmplt
Data Analysis with Python cmplt
Data Science Methodology cmplt
Databases and SQL for Data Science cmplt
Fundamentals of Project Planning and Management cmplt
Google Cloud Platform Big Data and Machine Learning Fundamentals cmplt
Introduction to Data Science in Python cmplt
Introduction to User Experience Design cmplt
Natural Language Processing in TensorFlow cmplt
Operating Systems and You: Becoming a Power User cmplt
Programming Foundations with JavaScript, HTML and CSS cmplt
Site Reliability Engineering: Measuring and Managing Reliability cmplt
Supply Chain Principles cmplt
The Social Context of Mental Health and Illness cmplt
Visual Elements of User Interface Design cmplt


In [29]:
for key,values in summary_dict.items():
    print(key,":")
    print(values, "\n")

Agile with Atlassian Jira :
["This online course is designed to help you learn how to use Jira in a variety of ways, It focuses on the basics of using Jira as well as how to set up and use it effectively, Most of the content is free, so you don't have to pay for a subscription. Some of the topics are more technical than others, but most of the material is useful for those who want to get started with Jira right away"] 

Become a CBRS Certified Professional Installer by Google :
['Provides all the information you need to pass the CPI exam, including how to study for and prepare for the exam, as well as how to take the exam itself, Includes a lot of helpful online resources that are easy to follow, Works well for both beginners and advanced users. Some questions on the final exam were not covered in the course, which may make it more difficult for some to pass'] 

Building Scalable Java Microservices with Spring Boot and Spring Cloud :
["A comprehensive online course that covers everythi

In [30]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_semantic_similarity(original_text, generated_summary):
    """
    Evaluates semantic similarity between the original text and the generated summary.

    Args:
    - original_text (str): The original input text.
    - generated_summary (str): The generated summary.

    Returns:
    - float: Semantic similarity score (range 0 to 1).
    """
    # Load pre-trained sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings for original text and summary
    embeddings = model.encode([original_text, generated_summary])

    # Compute cosine similarity
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return similarity_score

# Example Usage

total_score = 0
for key,values in course_reviews_dict.items():
    
    original_text = " ".join(course_reviews_dict[key])
    generated_summary = " ".join(summary_dict[key])

# Evaluate semantic similarity
    similarity_score = evaluate_semantic_similarity(original_text, generated_summary)
    total_score += similarity_score
    print(f"Semantic Similarity Score of :{key} is {similarity_score:.2f}")
    
avg_score = total_score/len(course_reviews_dict)
print(f"Average Semantic Similarity Score: {avg_score:.2f}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Agile with Atlassian Jira is 0.73


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Become a CBRS Certified Professional Installer by Google is 0.37


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Building Scalable Java Microservices with Spring Boot and Spring Cloud is 0.62


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Business Metrics for Data-Driven Companies is 0.77


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Data Analysis with Python is 0.52


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Data Science Methodology is 0.43


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Databases and SQL for Data Science is 0.57


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Fundamentals of Project Planning and Management is 0.64


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Google Cloud Platform Big Data and Machine Learning Fundamentals is 0.65


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Introduction to Data Science in Python is 0.52


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Introduction to User Experience Design is 0.62


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Natural Language Processing in TensorFlow is 0.56


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Operating Systems and You: Becoming a Power User is 0.56


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Programming Foundations with JavaScript, HTML and CSS is 0.62


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Site Reliability Engineering: Measuring and Managing Reliability is 0.42


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Supply Chain Principles is 0.48


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :The Social Context of Mental Health and Illness is 0.68


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic Similarity Score of :Visual Elements of User Interface Design is 0.50
Average Semantic Similarity Score: 0.57


In [32]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def evaluate_coherence(text):
    """
    Evaluates coherence by computing cosine similarity between consecutive sentence embeddings.

    Args:
    - text (str): The input text to evaluate.

    Returns:
    - float: Average coherence score (range 0 to 1).
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentences = text.split(". ")  # Simple sentence segmentation
    embeddings = model.encode(sentences)
    
    if len(embeddings) < 2:
        return 1.0  # If only one sentence, coherence is perfect
    
    similarities = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0] 
                    for i in range(len(embeddings) - 1)]
    
    return np.mean(similarities)

# Example Usage
total_coherence = 0
for key, values in summary_dict.items():
    generated_summary = " ".join(values)
    coherence_score = evaluate_coherence(generated_summary)
    total_coherence += coherence_score
    print(f"Coherence Score for {key}: {coherence_score:.2f}")

avg_coherence = total_coherence / len(summary_dict)
print(f"Average Coherence Score: {avg_coherence:.2f}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Agile with Atlassian Jira: 0.69


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Become a CBRS Certified Professional Installer by Google: 0.32


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Building Scalable Java Microservices with Spring Boot and Spring Cloud: 0.05


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Business Metrics for Data-Driven Companies: 0.48


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Data Analysis with Python: 0.34


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Data Science Methodology: 0.66


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Databases and SQL for Data Science: 0.40


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Fundamentals of Project Planning and Management: 0.15


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Google Cloud Platform Big Data and Machine Learning Fundamentals: 0.34


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Introduction to Data Science in Python: 0.34


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Introduction to User Experience Design: 0.29


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Natural Language Processing in TensorFlow: 0.09


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Operating Systems and You: Becoming a Power User: 0.40


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Programming Foundations with JavaScript, HTML and CSS: 0.33


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Site Reliability Engineering: Measuring and Managing Reliability: 0.27


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Supply Chain Principles: 0.23


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for The Social Context of Mental Health and Illness: 0.17


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Coherence Score for Visual Elements of User Interface Design: 0.41
Average Coherence Score: 0.33


In [31]:
for key,values in course_reviews_dict.items():
    original_text_len = " ".join(course_reviews_dict[key])
    summary_len = " ".join(summary_dict[key]) 
    print(key)
    print('Length of the Original Text:', len(original_text_len))

    print("Length of the Summarized Text:", len(summary_len))
    print("Summary Ratio:", 100*(len(summary_len)/len(original_text_len)), "%")

Agile with Atlassian Jira
Length of the Original Text: 98084
Length of the Summarized Text: 389
Summary Ratio: 0.3965988336527874 %
Become a CBRS Certified Professional Installer by Google
Length of the Original Text: 10270
Length of the Summarized Text: 385
Summary Ratio: 3.7487828627069133 %
Building Scalable Java Microservices with Spring Boot and Spring Cloud
Length of the Original Text: 22370
Length of the Summarized Text: 425
Summary Ratio: 1.899865891819401 %
Business Metrics for Data-Driven Companies
Length of the Original Text: 63162
Length of the Summarized Text: 438
Summary Ratio: 0.693454925429847 %
Data Analysis with Python
Length of the Original Text: 525980
Length of the Summarized Text: 408
Summary Ratio: 0.07756948933419522 %
Data Science Methodology
Length of the Original Text: 200617
Length of the Summarized Text: 388
Summary Ratio: 0.19340335066320402 %
Databases and SQL for Data Science
Length of the Original Text: 118992
Length of the Summarized Text: 390
Summary 