# EVAL USING ROUGE

prepare imports

In [2]:
from transformers import pipeline
import json
from rouge_score import rouge_scorer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


prepare functions

In [7]:
# Build the summarization pipeline once for the whole notebook; the model and
# the tokenizer are both taken from "../models/model2".
summarizer = pipeline(
    "summarization",
    model="../models/model2",
    tokenizer="../models/model2",
)
        
def summarize(text, prompt):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    The model input is ``"{prompt}: {text}"``. The generation length bounds
    are derived from the tokenized length of that input: at most twice the
    input length (capped at 1024) and at least a quarter of it (with a floor
    of 32), never exceeding the upper bound.

    Args:
        text: The text to summarize.
        prompt: Instruction prepended to ``text``, separated by ``": "``.

    Returns:
        The ``summary_text`` field of the first result returned by the
        pipeline.
    """
    inputs = f"{prompt}: {text}"

    # Length of the full, untruncated input in token ids; used only to size
    # the generation bounds below.
    input_len = len(summarizer.tokenizer.encode(inputs, truncation=False))

    max_length = min(input_len * 2, 1024)
    # Clamp the lower bound to the upper bound: for inputs shorter than 16
    # tokens (e.g. a post with an empty body) the floor of 32 would otherwise
    # be larger than ``input_len * 2`` and the two constraints would conflict.
    min_length = min(max(32, input_len // 4), max_length)

    summary = summarizer(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
    )
    return summary[0]['summary_text']
    
def process_data(response, prompt):
    """Summarize a post body and its comments from a two-part response.

    ``response[0]`` is read for the post: the ``selftext`` of its first child
    (empty string when absent). ``response[1]`` is read for the comments:
    the ``body`` of every child that has one, joined with single spaces.

    Args:
        response: Parsed response, indexed as described above.
        prompt: Prompt forwarded unchanged to ``summarize`` for both texts.

    Returns:
        A dict with the keys ``post_summary`` and ``comments_summary``.
    """
    post_data = response[0]['data']['children'][0]['data']
    post_content = post_data.get('selftext', '')

    # Children without a 'body' entry are skipped.
    bodies = [
        node['data']['body']
        for node in response[1]['data']['children']
        if 'body' in node['data']
    ]
    comments_all = ' '.join(bodies)

    # Dict values are evaluated in order, so the post is summarized first.
    return {
        "post_summary": summarize(post_content, prompt),
        "comments_summary": summarize(comments_all, prompt),
    }

Device set to use cuda:0


Load the Reddit post, summarize it, then save the summary to another JSON file.

In [8]:
# Parse the saved response from disk.
with open('../response.json') as source:
    reddit_post = json.loads(source.read())

# Summarize the post body and the comments with the same prompt.
summary = process_data(reddit_post, "Summarize and highlight popular brands")

# Persist both summaries as indented JSON.
with open('../summary.json', 'w') as target:
    json.dump(summary, target, indent=4)
print("Summary saved to summary.json")

Your max_length is set to 274, but your input_length is only 137. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 380, but your input_length is only 190. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)


Summary saved to summary.json


verify by printing the summaries

In [9]:
# Print each summary under its heading, post first and comments second.
for heading, key in (
    ("Post Summary:\n", "post_summary"),
    ("\nComments Summary:\n", "comments_summary"),
):
    print(heading, summary[key])

Post Summary:
 {"title": "Budget for a New Chair with Adjustable Headrest and Armrest", "selftext": "The user is looking for a new chair with an adjustable headrest, armrest, chair height, and a lumbar pillow with mesh material. They mention a budget of P3,000-P4,000 and highlight popular brands.", "comments": ["Users provide feedback on the chair's specifications and suggest alternative options.", "Some users suggest reaching out to influencers for recommendations on speakerphones or headphones for better long-distance communication.", "The sentiment is positive, with users appreciating the user's effort to find a chair and offering helpful suggestions for its design."], "sentiment": " the sentiment is encouraging and supportive, with Users sharing their own search experiences and offering suggestions for additional features on chairs and speakers."}

Comments Summary:
 {"title": "Budget-friendly office chair suggestions", "selftext": {"overview": "The post provides a budget-friendly 

Fix the JSON files: the summaries in the output file are stored as JSON-encoded strings instead of nested JSON objects.

In [8]:
# Read both files as raw text; parsing happens later.
with open('../summary.json', 'r') as generated_file:
    generated_summaries = generated_file.read()

with open('../reference.json', 'r') as reference_file:
    reference_summaries = reference_file.read()

def fix_json(jsonfile, path):
    """Decode the JSON-encoded summaries in ``jsonfile`` and write the result.

    ``jsonfile`` is JSON text whose ``post_summary`` and ``comments_summary``
    entries may themselves be JSON documents stored as strings. Each such
    string is parsed and replaced by the decoded value. Entries that are not
    strings are left as they are, so already-fixed input passes through
    unchanged. The result is printed and written to ``path`` with a 4-space
    indent.

    Args:
        jsonfile: JSON text (the file contents, not a file name).
        path: Destination file; overwritten if it exists.

    Raises:
        json.JSONDecodeError: If ``jsonfile`` or one of the summary strings
            is not valid JSON.
        KeyError: If ``post_summary`` or ``comments_summary`` is missing.
    """
    fixed_json = json.loads(jsonfile)

    for key in ('post_summary', 'comments_summary'):
        value = fixed_json[key]
        # Only string values are still JSON-encoded; calling json.loads on an
        # already-decoded dict or list would raise TypeError.
        if isinstance(value, str):
            fixed_json[key] = json.loads(value)

    print(json.dumps(fixed_json, indent=4))

    with open(path, 'w', encoding='utf-8') as file:
        json.dump(fixed_json, file, indent=4)