In [1]:
# Load the article that the rest of the notebook summarizes.
#
# The project root is taken to be the parent of the kernel's working
# directory (assumes the notebook is started from a direct sub-folder of the
# project -- TODO confirm). The root is put on sys.path before the
# `main.document` import below.
import os
import sys
from pprint import pprint

ABSOLUTE_PATH = os.getcwd()

# os.path.dirname understands the platform's path separator; splitting on a
# literal '/' does not work for Windows paths and yields '' for a directory
# directly under the filesystem root.
ROOT_PATH = os.path.dirname(ABSOLUTE_PATH)

# Re-running the cell must not pile up duplicate sys.path entries.
if ROOT_PATH not in sys.path:
    sys.path.append(ROOT_PATH)


from main.document import Document

input_file_path = os.path.join(ROOT_PATH, 'data', 'article.txt')
document = Document.load_from_local(input_file_path)
pprint(document.content)

('Teenagers in foster care in Scotland are being moved too often, according to '
 'a campaign group. Research carried out\n'
 'by the Fostering Network suggests almost half of fostered young people are '
 'already living with their third foster\n'
 'family since going into care. The group has warned that 750 more foster '
 'carers are \\"urgently\\" needed to meet the\n'
 'demands of the care system. It urged people to \\"open their hearts and '
 'homes\\" to vulnerable youngsters. Currently,\n'
 'more than 5,500 children are in foster care in Scotland, living with 4,400 '
 'families and carers. The Fostering Network\n'
 'surveyed 250 children, teenagers and foster carers across Scotland and '
 'discovered that many young people had failed\n'
 'to find stability. Almost half were already living with their third family, '
 'a quarter were with their fourth family\n'
 'and about 20 were living with their 10th family since going into care. There '
 'was a particular need for homes to be\n

In [2]:

# Common configuration
has_title = True                   # has a title
min_num_of_char_in_title = 32      # 32 <= title length <= 80 
max_num_of_char_in_title = 80
compression_rate = 0.3             # compression rate = 0.3
min_num_of_paragraph = 2           # 2 <= number of paragraphs <= 4
max_num_of_paragraph = 4

from openai import AsyncOpenAI
llm_api_client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="dummy_key")


from main.summarizer import BestHitLLMSummarizer
candidate_models = ["deepseek-r1:8b",
                   "qwen3-vl:8b",
                   "llama2:latest", ]
per_model_report = {}
for model in candidate_models:
    print(f'Trying model {model}')
    summarizer = BestHitLLMSummarizer(
        client=llm_api_client, 
        model=model,
        has_title=has_title,
        min_num_of_char_in_title=min_num_of_char_in_title,
        max_num_of_char_in_title=max_num_of_char_in_title,
        min_num_of_paragraph=min_num_of_paragraph,
        max_num_of_paragraph=max_num_of_paragraph,
        num_tries=3,
        llm_as_judge=False)
    
    report = await summarizer.summarize(document=document)
    per_model_report[model] = report

  from .autonotebook import tqdm as notebook_tqdm


Trying model deepseek-r1:8b
Summarize at iteration 0
Summarize at iteration 1
Summarize at iteration 2
Number of valid reports: 3




Valid report 1; Bert-Score-F1: 0.0; Roger-Score-F1: 0.0
Valid report 2; Bert-Score-F1: 0.6149623990058899; Roger-Score-F1: 0.36551724137931035
Valid report 3; Bert-Score-F1: 0.6435026526451111; Roger-Score-F1: 0.3652694610778443
Trying model qwen3-vl:8b
Summarize at iteration 0
Summarize at iteration 1
Summarize at iteration 2
Number of valid reports: 3
Valid report 1; Bert-Score-F1: 0.6124502420425415; Roger-Score-F1: 0.421875
Valid report 2; Bert-Score-F1: 0.6285790801048279; Roger-Score-F1: 0.40298507462686567
Valid report 3; Bert-Score-F1: 0.6187511682510376; Roger-Score-F1: 0.41
Trying model llama2:latest
Summarize at iteration 0
Summarize at iteration 1
Summarize at iteration 2
Number of valid reports: 3
Valid report 1; Bert-Score-F1: 0.6888040900230408; Roger-Score-F1: 0.6217391304347826
Valid report 2; Bert-Score-F1: 0.7325177192687988; Roger-Score-F1: 0.7955390334572491
Valid report 3; Bert-Score-F1: 0.6820167303085327; Roger-Score-F1: 0.6376811594202898


In [4]:
# Extract a bunch of metrics for evaluating those reports generated by different LLM models
from collections import defaultdict
from inspect import iscoroutinefunction

from main.metrics import HasTitleMetricExtractor
from main.metrics import TitleLengthMetricExtractor
from main.metrics import NumberOfParagraphMetricExtractor
from main.metrics import NumberOfTokenMetricExtractor
from main.llm_as_judge import Reference
from main.metrics import BertScoreMetricExtractor
from main.metrics import RougeScoreMetricExtractor
from main.metrics import CorrectnessMetricExtractor
from main.metrics import CompletenessMetricExtractor


model_used_for_evaluation = "llama2:latest"

metric_extractors = [HasTitleMetricExtractor(),
                     TitleLengthMetricExtractor(),
                     NumberOfParagraphMetricExtractor(),
                     NumberOfTokenMetricExtractor(),
                     BertScoreMetricExtractor(reference=Reference(content=document.content)),
                     RougeScoreMetricExtractor(reference=Reference(content=document.content)),
                     CorrectnessMetricExtractor(client=llm_api_client, 
                                                model=model_used_for_evaluation, 
                                                reference=Reference(content=document.content)),
                     CompletenessMetricExtractor(client=llm_api_client, 
                                                        model=model_used_for_evaluation, 
                                                        reference=Reference(content=document.content)),
                    ]

metrics = defaultdict(dict)
for model, report in per_model_report.items():
    print(f'evaluating model {model}')
    for metric_extractor in metric_extractors:
        print(f'Run {metric_extractor.__class__.__name__}')

        if iscoroutinefunction(metric_extractor.extract):
            metric = await metric_extractor.extract(report)
        else:
            metric = metric_extractor.extract(report)
            
        metrics[model][metric.name] = float(metric.value)
    print("--------------------------------------------------------------")
        

evaluating model deepseek-r1:8b
Run HasTitleMetricExtractor
Run TitleLengthMetricExtractor
Run NumberOfParagraphMetricExtractor
Run NumberOfTokenMetricExtractor
Run BertScoreMetricExtractor
Run RougeScoreMetricExtractor
Run CorrectnessMetricExtractor
Run CompletenessMetricExtractor
--------------------------------------------------------------
evaluating model qwen3-vl:8b
Run HasTitleMetricExtractor
Run TitleLengthMetricExtractor
Run NumberOfParagraphMetricExtractor
Run NumberOfTokenMetricExtractor
Run BertScoreMetricExtractor
Run RougeScoreMetricExtractor
Run CorrectnessMetricExtractor
Run CompletenessMetricExtractor
--------------------------------------------------------------
evaluating model llama2:latest
Run HasTitleMetricExtractor
Run TitleLengthMetricExtractor
Run NumberOfParagraphMetricExtractor
Run NumberOfTokenMetricExtractor
Run BertScoreMetricExtractor
Run RougeScoreMetricExtractor
Run CorrectnessMetricExtractor
Run CompletenessMetricExtractor
-----------------------------

In [21]:
# Show the metrics of the different models side by side.
#
# Both the header and the value rows are derived from the models actually
# stored in `metrics` (in insertion order), so they cannot disagree and the
# cell keeps working when the list of candidate models changes. Nothing is
# looked up through a hard-coded model name, which on a defaultdict would
# silently insert an empty entry for a model that was never evaluated.
LABEL_WIDTH = 50   # width of the metric-name column
VALUE_WIDTH = 30   # width of each per-model column

candidate_models = list(metrics)

# Union of the metric names over all models, in first-seen order.
metric_names = list(dict.fromkeys(
    metric_name
    for candidate_model in candidate_models
    for metric_name in metrics[candidate_model]
))

cols = [f'{"":<{LABEL_WIDTH}}']
for candidate_model in candidate_models:
    cols.append(f'{candidate_model:<{VALUE_WIDTH}}')
print(''.join(cols))
print('----------------------------------------------------------------------------------------------------------------------------')


for name in metric_names:
    cols = [f'{name:<{LABEL_WIDTH}}']
    for candidate_model in candidate_models:
        # A metric missing for one model is shown as "n/a" instead of
        # aborting the whole table with a KeyError.
        value = metrics[candidate_model].get(name, 'n/a')
        cols.append(f'{value:<{VALUE_WIDTH}}')
    print(''.join(cols))

                                                  deepseek-r1:8b                qwen3-vl:8b                   llama2:latest                 
----------------------------------------------------------------------------------------------------------------------------
Has-Title-Metric                                  1.0                           1.0                           1.0                           
Number-Of-Chars-In-Title-Metric                   72.0                          883.0                         41.0                          
Number-Of-Paragraphs-Metric                       5.0                           4.0                           4.0                           
Number-Of-Tokens-Metric                           162.0                         125.0                         268.0                         
Bert-Score-Metric                                 0.6435026526451111            0.6124502420425415            0.7325177192687988            
Rouge-Score-Metric           