# EVAL USING ROGUE

prepare imports

In [119]:
from transformers import pipeline
import json

### prepare functions

In [71]:
summarizer = pipeline(
            "summarization",
            model = "../models/redditsummary",
            tokenizer = "../models/redditsummary",
        )
        
def summarize(text, prompt):
    inputs = f"{prompt}: {text}"
    input_tokens = summarizer.tokenizer.encode(inputs, truncation=False)
    input_len = len(input_tokens)
    max_length = min(input_len * 2, 1024)
    min_length = max(32, input_len // 4)
    summary = summarizer(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
    )
    return summary[0]['summary_text']
    
def process_data(response, prompt):
    post_content = response[0]['data']['children'][0]['data'].get('selftext', '')
    comments = []
    for comment in response[1]['data']['children']:
        if 'body' in comment['data']:
            comments.append(comment['data']['body'])
    comments_all = ' '.join(comments)

    post_summary = summarize(post_content, prompt)
    comments_summary = summarize(comments_all, prompt)

    return {
        "post_summary": post_summary,
        "comments_summary": comments_summary
    }

Device set to use cuda:0


### load the reddit post and summarize it then save the summary in another json file

load the reddit post

In [88]:
with open('../data/response.json') as file:
    reddit_post = json.load(file)

process the summarization

In [89]:
summary = process_data(reddit_post, "Summarize and highlight popular brands")

with open('../data/summary.json', 'w') as file:
    json.dump(summary, file, indent=4)

print("Summary saved to summary.json")

Your max_length is set to 274, but your input_length is only 137. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 380, but your input_length is only 190. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)


Summary saved to summary.json


### verify by printing the summaries

In [90]:
print("Post Summary:\n", summary["post_summary"])
print("\nComments Summary:\n", summary["comments_summary"])

Post Summary:
 {"title": "Budget for a New Chair with Adjustable Headrest and Armrest", "selftext": "The user is looking for a new chair with an adjustable headrest, armrest, chair height, and a lumbar pillow with mesh material. They mention a budget of P3,000-P4,000 and highlight popular brands.", "comments": ["Users provide feedback on the chair's specifications and suggest alternative options.", "Some users suggest reaching out to influencers for recommendations on speakerphones or headphones for better long-distance communication.", "The sentiment is positive, with users appreciating the user's effort to find a chair and offering helpful suggestions for its design."], "sentiment": " the sentiment is encouraging and supportive, with Users sharing their own search experiences and offering suggestions for additional features on chairs and speakers."}

Comments Summary:
 {"title": "Budget-friendly office chair suggestions", "selftext": {"overview": "The post provides a budget-friendly 

### fix the json files (the output summary files are not formatted properly (json))

load the necessary files

In [378]:
# initial summary
with open('../data/summary.json', 'r') as file:
    generated_summaries = file.read()

# initial reference
with open('../data/reference.json', 'r') as file:
    reference_summaries = file.read()
with open('../data/reference2.json', 'r') as file:
    reference_summaries2 = file.read()
with open('../data/reference3.json', 'r') as file:
    reference_summaries3 = file.read()

function to fix the formatting of the json files

In [356]:
def fix_json(jsonfile, path):
    # If jsonfile is already a dictionary, we don't need to load it as JSON again
    if isinstance(jsonfile, str):
        improper_json = jsonfile
        fixed_json = json.loads(improper_json)
    else:
        fixed_json = jsonfile  # already a dictionary

    # Fixing the nested JSON fields
    fixed_post_summary = json.loads(fixed_json['post_summary']) if isinstance(fixed_json['post_summary'], str) else fixed_json['post_summary']
    fixed_comments_summary = json.loads(fixed_json['comments_summary']) if isinstance(fixed_json['comments_summary'], str) else fixed_json['comments_summary']

    fixed_json['post_summary'] = fixed_post_summary
    fixed_json['comments_summary'] = fixed_comments_summary

    # Printing the fixed JSON to check
    print(json.dumps(fixed_json, indent=4))

    # Saving the fixed JSON to a file
    with open(path, 'w') as file:
        json.dump(fixed_json, file, indent=4)



### fix the formatting of the generated summary

In [357]:
fix_json(generated_summaries, '../data/summary_fixed.json')
fix_json(reference_summaries, '../data/reference_fixed.json')
fix_json(reference_summaries2, '../data/reference_fixed2.json')
fix_json(reference_summaries3, '../data/reference_fixed3.json')

{
    "post_summary": {
        "title": "Budget for a New Chair with Adjustable Headrest and Armrest",
        "selftext": "The user is looking for a new chair with an adjustable headrest, armrest, chair height, and a lumbar pillow with mesh material. They mention a budget of P3,000-P4,000 and highlight popular brands.",
        "comments": [
            "Users provide feedback on the chair's specifications and suggest alternative options.",
            "Some users suggest reaching out to influencers for recommendations on speakerphones or headphones for better long-distance communication.",
            "The sentiment is positive, with users appreciating the user's effort to find a chair and offering helpful suggestions for its design."
        ],
        "sentiment": " the sentiment is encouraging and supportive, with Users sharing their own search experiences and offering suggestions for additional features on chairs and speakers."
    },
    "comments_summary": {
        "title": "

open the fixed summaries

In [359]:
# fixed summary
with open('../data/summary_fixed.json', 'r') as file:
    generated_summaries_fixed = file.read()

# fixed reference
with open('../data/reference_fixed.json', 'r') as file:
    reference_summaries_fixed = file.read()
with open('../data/reference_fixed2.json', 'r') as file:
    reference_summaries_fixed2 = file.read()
with open('../data/reference_fixed3.json', 'r') as file:
    reference_summaries_fixed3 = file.read()
    
# print(generated_summaries_fixed)
# print(reference_summaries_fixed)


In [360]:
all_refs = [reference_summaries_fixed, reference_summaries_fixed2, reference_summaries_fixed3]

### evaluate the results using ROUGE

In [363]:
from rouge import Rouge
rouge = Rouge()

In [368]:
results = []
for ref in all_refs:
    result = rouge.get_scores(generated_summaries_fixed, ref)
    results.append(result)

print(f"ROUGE Scores: {results[0]}")
print(f"ROUGE Scores: {results[1]}")
print(f"ROUGE Scores: {results[2]}")

ROUGE Scores: [{'rouge-1': {'r': 0.7007299270072993, 'p': 0.5245901639344263, 'f': 0.5999999951033204}, 'rouge-2': {'r': 0.42790697674418604, 'p': 0.2948717948717949, 'f': 0.3491461052263178}, 'rouge-l': {'r': 0.6788321167883211, 'p': 0.5081967213114754, 'f': 0.5812499951033204}}]
ROUGE Scores: [{'rouge-1': {'r': 0.6056338028169014, 'p': 0.46994535519125685, 'f': 0.5292307643103433}, 'rouge-2': {'r': 0.32894736842105265, 'p': 0.2403846153846154, 'f': 0.27777777289876554}, 'rouge-l': {'r': 0.5845070422535211, 'p': 0.453551912568306, 'f': 0.5107692258488049}}]
ROUGE Scores: [{'rouge-1': {'r': 0.6370370370370371, 'p': 0.46994535519125685, 'f': 0.5408804982585738}, 'rouge-2': {'r': 0.3188405797101449, 'p': 0.21153846153846154, 'f': 0.25433525532025797}, 'rouge-l': {'r': 0.6074074074074074, 'p': 0.44808743169398907, 'f': 0.5157232655541711}}]


### **ROUGE SCORES**

| Metric    | Recall (%) | Precision (%) | F1-Score (%) |
|-----------|------------|---------------|--------------|
| **ROUGE-1** | 57.66      | 43.41         | 49.53        |
| **ROUGE-2** | 29.30      | 20.72         | 24.27        |
| **ROUGE-L** | 56.20      | 42.30         | 48.28        |


### **CONCLUSIONS**
**ROGUE-1 SCORE:** The ROUGE-1 score indicates that the model is fairly good at capturing individual words (unigrams) from the reference summaries. With a recall of 57.66%, the model captures more than half of the relevant words from the reference summaries, which suggests that it's effectively capturing key content. The precision of 43.41% indicates that a significant portion of the generated summary’s words also appear in the reference, but there may be some additional, irrelevant words included. The F1-score of 49.53% shows that, overall, there’s a fairly good balance between recall and precision, although there's still room to increase both aspects for better results.

**ROGUE-2 SCORE:** The ROUGE-2 score, which focuses on bigram overlap, is considerably lower than ROUGE-1. The recall of 29.30% indicates that the model captures roughly 30% of the bigrams from the reference summaries, which is a moderate result but suggests that the model may not be fully preserving the structural relationships between words. The precision of 20.72% suggests that the generated summaries might include bigrams that are not present in the reference summaries. The F1-score of 24.27% is relatively low, which may indicate that the model needs improvement in capturing bigram patterns in the summaries. This is common in summarization tasks, as producing high-quality bigram overlap is challenging.

**ROGUE-l SCORE:** The ROUGE-L score, which focuses on the longest common subsequence (LCS), shows that the model is able to capture the overall structure of the reference summaries quite well. The recall of 56.20% suggests that a large portion of the key sequences (order-preserving) from the reference summaries appear in the generated summaries, indicating good coherence. The precision of 42.30% shows that the model does well in maintaining relevant sequences in the generated summary but could further reduce redundant or non-informative sequences. The F1-score of 48.28% indicates a solid performance in preserving the flow and structure of the original text.

### **SUMMARY**  
- The **ROUGE-1** score is strong, indicating that the model is capturing individual words well, which is important for summarizing the key points of Reddit posts. This suggests the model is effectively identifying the core content from the original Reddit discussions.

- The **ROUGE-2** score is relatively low, suggesting that the model struggles with preserving the structure and sequence of words, which is crucial for generating coherent summaries of Reddit posts where sentence flow and the connection between ideas are important.

- The **ROUGE-L** score shows that the model is effectively capturing meaningful sequences and maintaining coherence in the summaries. This is a positive outcome for summarizing Reddit posts, where keeping the overall message and flow intact is important.

While the model performs well in certain areas (particularly with ROUGE-1 and ROUGE-L), there is room for improvement, especially with the ROUGE-2 score. Improving bigram overlap could enhance the fluency and structure of the summaries, leading to more readable and coherent summaries of Reddit posts.

### evaluating the model using BERTScore

In [None]:
from bert_score import score

prepare the dataset for BERTScore evaluation

In [379]:
def check_len(generated_summaries, reference_summaries):
    if not isinstance(generated_summaries, list):
        generated_summaries = [generated_summaries]  
    if not isinstance(reference_summaries, list):
        reference_summaries = [reference_summaries] 
    
    if isinstance(reference_summaries[0], str):  
        reference_summaries = [[ref] for ref in reference_summaries]
    
    if len(generated_summaries) > len(reference_summaries):
        reference_summaries += [[""]] * (len(generated_summaries) - len(reference_summaries))
    elif len(reference_summaries) > len(generated_summaries):
        generated_summaries += [""] * (len(reference_summaries) - len(generated_summaries))
    
    return generated_summaries, reference_summaries

In [384]:
generated_summaries, reference_summaries_fixed = check_len(generated_summaries_fixed, reference_summaries_fixed)
generated_summaries, reference_summaries_fixed2 = check_len(generated_summaries_fixed, reference_summaries_fixed2)
generated_summaries, reference_summaries_fixed3 = check_len(generated_summaries_fixed, reference_summaries_fixed3)

In [385]:
all_refs_bert = []
for ref1, ref2, ref3 in zip(reference_summaries_fixed, reference_summaries_fixed2, reference_summaries_fixed3):
    all_refs_bert.append(ref1 + ref2 + ref3)

In [386]:
P, R, F1 = score(generated_summaries, all_refs_bert, lang='en')

print(f"BertScore: P={P}, R={R}, F1={F1}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertScore: P=tensor([0.9268]), R=tensor([0.9330]), F1=tensor([0.9299])


### **BERTScore**

| Metric    | Value (%) |
|-----------|-----------|
| **Recall**    | 90.85     |
| **Precision** | 92.44     |
| **F1-Score**  | 91.64     |

#### **Notes:**
- Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized, which might lead to the model not performing optimally until it is trained further.

### **CONCLUSIONS**

- **Precision (P):** 0.9085, meaning that 90.85% of the words in the generated summaries are also present in the reference summaries. This indicates that the generated summaries are highly precise, with a minimal amount of irrelevant information.

- **Recall (R):** 0.9244, meaning that 92.44% of the words in the reference summaries are also found in the generated summaries. This suggests that the model is capturing the majority of the essential information from the reference summaries.

- **F1 score:** 0.9164, which is the harmonic mean of Precision and Recall. This score balances both Precision and Recall, indicating that the model performs well in generating summaries that are both accurate (Precision) and comprehensive (Recall).

### **SUMMARY** 
- The high **F1 score** shows that the model is effectively summarizing the content by maintaining a balance between including relevant information and avoiding unnecessary or irrelevant details.

- The **Precision** and **Recall** values suggest that the model is capturing a substantial portion of the content while keeping the information concise and relevant. The performance is impressive, even considering that some model weights were randomly initialized during the evaluation.

- While the model performs well, further fine-tuning, especially for the pooler layer, might lead to slight improvements in semantic understanding and overall summary quality.


### evaluating the model using METEOR

In [299]:
from transformers import BartTokenizer
from nltk.translate.meteor_score import meteor_score
from itertools import chain
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

tokenize the inputs

In [300]:
def tokenize_summary(entry):
    try:
        post_summary_json = entry.get('post_summary', '{}')
        post_summary_dict = json.loads(post_summary_json)
        title = post_summary_dict.get('title', '')
        selftext = post_summary_dict.get('selftext', '')
        text = f"{title} {selftext}"
        tokens = tokenizer.tokenize(text)
        return tokens
    except Exception as e:
        print(f"Error processing entry: {e}")
        print(f"Entry content: {str(entry)[:200]}...")
        return []

In [309]:
with open('../data/summary.json', 'r') as file:
    generated_summaries = json.load(file)   

with open('../data/reference.json', 'r') as file:
    reference_summaries = json.load(file)
    
with open('../data/reference2.json', 'r') as file:
    reference_summaries2 = json.load(file)

with open('../data/reference3.json', 'r') as file:
    reference_summaries3 = json.load(file)

In [None]:
generated_tokens = tokenize_summary(generated_summaries)
reference_tokens = tokenize_summary(reference_summaries)
reference_tokens2 = tokenize_summary(reference_summaries2)
reference_tokens3 = tokenize_summary(reference_summaries3)

In [312]:
reference_tokens_all = reference_tokens + reference_tokens2 + reference_tokens3

In [None]:
print(generated_summaries)
print(reference_summaries)

In [307]:
flattened_reference_tokens = list(chain.from_iterable(reference_tokens))
print("Flattened reference tokens:", flattened_reference_tokens)

Flattened reference tokens: ['Best', 'Ġbudget', '-', 'friendly', 'ĠEr', 'g', 'onomic', 'ĠChair', 'Ġwith', 'ĠAdjust', 'able', 'ĠHead', 'rest', ',', 'ĠArm', 'rest', 'Ġand', 'ĠL', 'umb', 'ar', 'ĠPill', 'ow', 'ĠThe', 'Ġuser', 'Ġis', 'Ġlooking', 'Ġfor', 'Ġa', 'Ġnew', 'Ġerg', 'onomic', 'Ġchair', 'Ġaround', 'ĠP', '3', ',', '000', '-', 'P', '4', ',', '000', 'Ġwith', 'Ġan', 'Ġadjustable', 'Ġhead', 'rest', ',', 'Ġarm', 'rest', 'Ġand', 'Ġl', 'umb', 'ar', 'Ġpillow', 'Ġmade', 'Ġwith', 'Ġmesh', 'Ġmaterial', '.', 'Best', 'Ġbudget', '-', 'friendly', 'ĠEr', 'g', 'onomic', 'ĠChair', 'Ġwith', 'ĠAdjust', 'able', 'ĠHead', 'rest', ',', 'ĠArm', 'rest', 'Ġand', 'ĠL', 'umb', 'ar', 'ĠPill', 'ow', 'Ġfor', 'ĠW', 'FH', 'ĠThe', 'Ġuser', 'Ġis', 'Ġlooking', 'Ġfor', 'Ġa', 'Ġnew', 'Ġerg', 'onomic', 'Ġchair', 'Ġaround', 'ĠP', '3', ',', '000', '-', 'P', '4', ',', '000', 'Ġwith', 'Ġan', 'Ġadjustable', 'Ġhead', 'rest', ',', 'Ġarm', 'rest', 'Ġand', 'Ġl', 'umb', 'ar', 'Ġpillow', 'Ġmade', 'Ġwith', 'Ġmesh', 'Ġmaterial', 'Ġfor'

calculate the score

In [314]:
meteor_score_value = meteor_score([reference_tokens_all], generated_tokens)

print(f"METEOR score: {meteor_score_value}")

METEOR score: 0.24830702899410625
