# Prepare tasks for human salience annotation study

In [1]:
import pandas as pd
import json

## Utilities

Load datasets

In [2]:
DATASETS = {
    dataset: pd.read_json(f'../output/{dataset}/discord_answers.json')
    for dataset in ['pubmed-sample', 'cs-cl', 'astro-ph', 'qmsum-generic']
}

In [None]:
def display_docs_with_all_questions_answered(dataset):
    df_answers = DATASETS[dataset]
    n_question = df_answers['question'].nunique()
    answered_by_doc = df_answers[df_answers['reference_answer'] != 'no answer'].groupby('doc_id').size()
    filtered = answered_by_doc[answered_by_doc == n_question]
    display(filtered)
    return filtered


def display_qa_sample(dataset, doc_id, n):
    df_answers = DATASETS[dataset]
    with pd.option_context('display.max_colwidth', None):
        display(df_answers[df_answers['doc_id'] == doc_id].head(n))

## Select sample Q/A for ranking/rating tasks

In [4]:
dataset = 'pubmed-sample'
display_docs_with_all_questions_answered(dataset)
display_qa_sample(dataset, doc_id=38221859, n=-1)

doc_id
38206144    21
38221859    21
38232817    21
38236440    21
38327148    21
38407394    21
dtype: int64

Unnamed: 0,doc_id,cluster_id,question,reference_answer
2835,38221859,1,What is the main focus of the study?,Assessing the efficacy and safety of prusogliptin (DBPR108) in individuals with type 2 diabetes.
2836,38221859,10,Which patient population is the study concerned with?,Adult patients with type 2 diabetes who had not been using glucose-lowering agents regularly for the 8 weeks before the screening period.
2837,38221859,9,What condition is being addressed in the study?,Type 2 diabetes.
2838,38221859,18,What is the participant demographic or characteristics in the study?,"The participants were adult patients with type 2 diabetes, with a mean age of 54.3 years, 58% were men, and the median duration of type 2 diabetes was 0.38 years."
2839,38221859,36,What was the main intervention used in the study?,"DBPR108 100 mg, sitagliptin 100 mg, and placebo were given once daily to the patients."
2840,38221859,31,What are the significant benefits of the intervention?,"The intervention (DBPR108) significantly reduced glycated haemoglobin (HbA1c) levels, fasting and postprandial plasma glucose levels, and had little effect on body weight."
2841,38221859,6,What are the specific biological markers influenced by the intervention?,"Glycated haemoglobin (HbA1c), fasting plasma glucose, and postprandial plasma glucose levels."
2842,38221859,34,What specific treatments were compared in the study?,"DBPR108 100 mg, sitagliptin 100 mg, and placebo."
2843,38221859,23,What specific metrics or outcomes were measured?,"The specific metrics or outcomes measured included the mean change in glycated haemoglobin (HbA1c) levels, mean treatment difference between groups, fasting and postprandial plasma glucose levels, changes in body weight, and the incidence of adverse events."
2844,38221859,28,What are secondary outcomes noted in the study?,"The study notes the reduction in fasting and postprandial plasma glucose levels as a result of DBPR108 treatment, and it also notes that DBPR108 had little effect on body weight."


In [5]:
dataset = 'qmsum-generic'
display_docs_with_all_questions_answered(dataset)
display_qa_sample(dataset, 'Product-train-74', n=-1)

doc_id
Academic-test-7       10
Academic-train-12     10
Academic-train-13     10
Academic-train-16     10
Academic-train-17     10
Academic-train-19     10
Academic-train-24     10
Academic-train-26     10
Academic-train-3      10
Academic-train-36     10
Academic-train-5      10
Academic-train-6      10
Academic-val-4        10
Committee-test-0      10
Committee-train-1     10
Committee-train-11    10
Committee-train-18    10
Committee-train-19    10
Committee-train-23    10
Committee-train-4     10
Committee-train-8     10
Committee-train-9     10
Committee-val-2       10
Committee-val-3       10
Product-test-14       10
Product-train-1       10
Product-train-53      10
Product-train-54      10
Product-train-63      10
Product-train-66      10
Product-train-74      10
Product-train-95      10
dtype: int64

Unnamed: 0,doc_id,cluster_id,question,reference_answer
670,Product-train-74,9,Who are the participants and their roles discussed in the meeting?,"The participants are Matthew (User Interface), Anna (Marketing), Nanne (Project Manager), and Mael (Industrial Designer). Their roles are User Interface, Marketing, Project Manager, and Industrial Designer, respectively."
671,Product-train-74,4,What main topic was discussed in the meeting?,"The main topic of the meeting was the new remote control project, specifically designing a new remote control for a television set, including its functionality, user interface, and market requirements."
672,Product-train-74,2,What were the main objectives or goals discussed in the meeting?,"The main objectives or goals discussed in the meeting were to introduce the new project of designing a remote control, to understand the roles of each team member, to learn how to use the whiteboard tool, to brainstorm and gather ideas for the remote control design, and to set clear goals and expectations for the project."
673,Product-train-74,10,Which aspects of the main topic were covered in the discussion?,"The main topic was about designing a new remote control for a television set. Aspects covered in the discussion include the user requirements, such as having a user-friendly interface, a browsing function, volume and channel control, and possibly internet functionality. The team also discussed the importance of the remote control being original, trendy, and fitting within a budget of 25 Euros, with a target profit of 50 million Euros."
674,Product-train-74,15,What are the identified challenges or concerns discussed?,"The challenges or concerns discussed include:\n\n- The microphone cord not working or being cumbersome (INDUSTRIAL DESIGNER, USER INTERFACE, PROJECT MANAGER)\n- The need for the remote control to be original, trendy, and user-friendly (PROJECT MANAGER)\n- The need to keep the production cost at a maximum of 12.50 Euro (PROJECT MANAGER)\n- Selling at least 4 million units to make a profit (USER INTERFACE)\n- Difficulty in guessing the animal drawn by MARKETING (all team members)\n- The need to consider the user requirements before designing the remote control (INDUSTRIAL DESIGNER, MARKETING)\n- The need to balance between adding features and keeping the remote control simple and user-friendly (MARKETING, PROJECT MANAGER)\n- The desire to design a new remote control rather than partnering with a manufacturer (INDUSTRIAL DESIGNER)"
675,Product-train-74,14,What detailed strategies or solutions were proposed for the challenges discussed?,"The team discussed several challenges for designing a new remote control, including:\n- Making it original, trendy, and user-friendly\n- Ensuring it has a good balance of functionality and affordability, with a goal of selling it at a price of 25 Euros\n- Providing a browsing function for channels\n- Incorporating common features such as volume and channel controls, teletext, and possibly internet capabilities\n- Leaving out unnecessary features to keep the price low\n- Designing a new remote control rather than collaborating with existing manufacturers\n\nFor future development, the team decided to:\n- Work on a general design for the remote control, keeping it flexible for potential additions of new functions\n- Focus on technical function design\n- Consider user requirements, such as whether the remote control needs internet capabilities or should stay with a basic TV interface\n- Communicate through email and other forms of communication before the next meeting."
676,Product-train-74,7,What were the anticipated impacts or implications discussed?,"The anticipated impacts or implications discussed include the need for the new remote control to be original, trendy, user-friendly, and to sell at least four million units to make a profit of 50 million Euros. Additionally, the production cost should be no more than 12.50 Euros, and the selling price is expected to be 25 Euros. The remote control should have the expected functionality of a remote control, including browsing channels, adjusting volume, and changing channels, with the main controls being easy to use and obvious."
677,Product-train-74,17,What were the major outcomes or decisions made during the meeting?,"- The team will work on designing a new remote control for television sets.\n- The remote control should be original, trendy, and user-friendly.\n- The target price of the product is 25 Euros.\n- The production cost must be no more than 12.50 Euros.\n- The team will aim to sell at least 4 million units to make a profit.\n- The remote control should have basic functions such as browsing channels, changing volume, and accessing teletext.\n- The team will work on the technical function design and requirements in the next meeting.\n- The next meeting will be in 30 minutes.\n- The team will be informed via email and other forms of communication about the next steps."
678,Product-train-74,3,What collaborative efforts or partnerships were discussed?,"The team discussed the possibility of partnering with TV manufacturers to sell their product, and Matthew suggested that they could sell their product together with the TV manufacturers' product."


In [6]:
dataset = 'cs-cl'
display_docs_with_all_questions_answered(dataset)
display_qa_sample(dataset, '2410.11996v1', n=-1)

doc_id
2410.10801v1    14
2410.11996v1    14
2410.12292v2    14
2410.12329v1    14
2410.12341v1    14
2410.12476v1    14
2410.12662v1    14
2410.12893v1    14
2410.12934v1    14
2410.13085v1    14
2410.13191v2    14
2410.13351v1    14
2410.13825v1    14
2410.14157v1    14
2410.14179v1    14
2410.14235v1    14
2410.14609v1    14
dtype: int64

Unnamed: 0,doc_id,cluster_id,question,reference_answer
672,2410.11996v1,15,What main topic is the document addressing?,"The main topic is the evaluation and improvement of Long-Context Language Models (LCLMs) for tasks requiring understanding and processing extended contexts, including the design of a new benchmark to assess their holistic reasoning capabilities."
673,2410.11996v1,23,What are the main approaches or techniques discussed in the document?,"The main approaches or techniques discussed are Neural Databases, Long-context Language Model Benchmarks, and Retrieval-Augmented Generation (RAG)."
674,2410.11996v1,12,What recent advancements or innovations are highlighted in the document?,"Thorne et al. (2021) proposed a neural database architecture, Trappolini et al. (2023) extended neural databases to multimodal data, and several studies (Zhang et al., Lee et al., Hsieh et al., Kuratov et al., Karpinska et al., Laban et al., Levy, Jacoby, and Goldberg, and Zhang et al.) introduced benchmarks to evaluate LCLMs on tasks requiring understanding and processing extended contexts."
675,2410.11996v1,8,How does the study relate to previous research in the field?,"The study aims to address limitations of previous benchmarks and existing research, including those of Thorne et al. (2021) and Trappolini et al. (2023), by providing a more comprehensive evaluation of LCLMs' holistic reasoning capabilities and tackling the challenges of long-context understanding."
676,2410.11996v1,20,Which previous works or studies are referenced?,"Thorne et al. (2021), Trappolini et al. (2023), Kamradt (2023), Lee et al. (2024), Hsieh et al. (2024), Kuratov et al. (2024), Karpinska et al. (2024), Laban et al. (2024), Levy, Jacoby, and Goldberg (2024), Zhang et al. (2024), En.QA in `\infty\text{BENCH}`, Lewis et al. (2020), Z. Li et al. (2024), and Tan Yu, Xu, and Akkiraju (2024)."
677,2410.11996v1,25,What is a prominent method mentioned for enhancing model effectiveness?,"Retrieval-Augmented Generation (RAG) is a prominent method mentioned for enhancing model effectiveness, combining retrieval-based methods and generative language models to enhance the generation of accurate and contextually relevant responses."
678,2410.11996v1,14,What challenge or gap is identified in the research?,"The challenge or gap identified in the research is that existing benchmarks for long-context language models (LCLMs) predominantly focus on localized tasks or summarization, do not fully assess models' abilities to handle complex reasoning tasks over vast amounts of information, and are either prohibitively expensive to create or prone to annotation errors."
679,2410.11996v1,30,What improvements or contributions do the proposed methods make?,"The proposed methods aim to evaluate LCLMs' holistic reasoning capabilities by providing a more comprehensive evaluation of their ability to synthesize and integrate vast amounts of information distributed across extended contexts in an automated and scalable manner. They also aim to address the limitations of existing benchmarks, which predominantly focus on localized tasks or summarization, and provide a more comprehensive evaluation of models' abilities to handle complex reasoning tasks over vast amounts of information in lengthy documents."
680,2410.11996v1,11,What are the new approaches or methods proposed to address the challenges?,A corpus generation framework that can systematically vary the complexity and distribution of information is proposed to address the limitations of existing benchmarks and provide a more comprehensive evaluation of LCLMs' holistic reasoning capabilities.
681,2410.11996v1,27,What are the main methods or techniques evaluated in the study?,"Long-context language models (LCLMs), Retrieval-Augmented Generation (RAG), and neural databases are the main methods or techniques evaluated in the study."


In [7]:
dataset = 'astro-ph'
filtered = display_docs_with_all_questions_answered(dataset)
display_qa_sample(dataset, '2407.16492v1', n=-1)
# print()

# for doc_id in pd.Series(filtered.index).sample(10, random_state=42):
#     print(doc_id)
#     display_qa_sample(dataset, doc_id, n=-1)

doc_id
0902.3117v1           13
0903.0636v1           13
1001.3420v3           13
1006.2763v2           13
1111.1527v2           13
                      ..
astro-ph/0305601v2    13
astro-ph/0502553v1    13
astro-ph/0512503v2    13
astro-ph/0601350v1    13
astro-ph/0703199v1    13
Length: 67, dtype: int64

Unnamed: 0,doc_id,cluster_id,question,reference_answer
962,2407.16492v1,0,What is the main focus of the study?,"The main focus of the study is to test cosmic evolution of SNe Ia, specifically to quantify systematics from any evolution of intrinsic properties with the age of the universe, which is crucial for precision probes of dark energy."
963,2407.16492v1,10,What specific phenomena or processes are being investigated in the study?,The study investigates the following phenomena or processes:\n\n- Evolution of intrinsic properties of SNe Ia with the age of the universe\n- Systematics related to dark energy inference\n- Cosmic evolution of SNe Ia\n- Progenitor channels of SNe Ia\n- Differences in spectral properties between low-$z$ and high-$z$ SNe Ia\n- Differences in line velocities between intermediate- and high-$z$ SNe Ia and low-$z$ SNe Ia\n- Differences in UV features between local and distant SNe Ia\n- The feasibility of using strongly lensed supernovae to test cosmic evolution of SNe Ia and control systematics in dark energy inference.
964,2407.16492v1,17,What broader context or field does the study contribute to or address?,"The study contributes to the field of cosmology, specifically the understanding of Type Ia supernovae (SNe Ia) and their role as precision probes of dark energy. It also addresses the context of understanding the evolution of SNe Ia with the age of the universe and their progenitor channels."
965,2407.16492v1,3,What specific challenges or limitations does the study address or identify?,"The study identifies the following specific challenges or limitations: \n- The future need for observations with G140M / F070LP to compare UV features at 2920 Å and 3180 Å between local and very high-z gLSNe Ia.\n- The need for future observations to test whether differences in the feature around 3500 Å are also seen at later phases.\n- The limitation of the current study not covering such blue features, which were previously found to be depressed in local SNe Ia compared to more distant SNe Ia."
966,2407.16492v1,8,What methodology or techniques are employed in the study?,The study employed the following methodologies and techniques:\n- SNID template matching for subclassification of the supernova.\n- Template matching to find the best fit phase of observations.\n- Binning of composite spectra based on the lightcurve shape.\n- Comparison with the mean low-z spectrum in the same wavelength region.\n- Quantification of spectral line velocities of the supernova.\n- Comparison with predictions from different explosion models to infer progenitor channels.\n- Use of the kaepora relational database for building a composite spectrum of low-z SNe Ia.\n- Use of spectra to compare with model scenarios.
967,2407.16492v1,6,What comparisons are made within the study?,"The study compares SN Encore to low-$z$ SNe Ia in terms of their spectra and lightcurve properties. Specifically, it compares the spectral properties of SN Encore to low-$z$ SNe Ia at post-maximum light, and to a low-$z$ sample binned by lightcurve shape. It also compares the spectral line velocities of SN Encore, SN H0pe, and iPTF16geu to those of low-$z$ SNe Ia. Additionally, it compares the spectra of SN Encore to predictions from different explosion models ($M_{\rm ch}$ and sub-$M_{\rm ch}$ models)."
968,2407.16492v1,15,What are the main findings of the study?,"The main findings of the study are: \n- There is no evidence for differences between the spectral properties of the high-z SN Ia SN Encore and low-z SNe Ia based on post-maximum light spectra.\n- The line velocities measured for intermediate- and high-z SNe Ia are consistent with the distribution observed for low-z SNe Ia.\n- Sub-Mch models underpredict the features near ~6700 Å, while DDC and PDDEL models overpredict the complex at ~5500 Å.\n- The study does not find significant differences in the spectrum of SN Encore compared to low-z SNe Ia in the same wavelength region, except for the slowest evolving subsample."
969,2407.16492v1,14,What detailed evidence or data is used to support the study's claims?,"The study uses the following detailed evidence or data to support its claims:\n- Analysis of the spectra of SN Encore, a lensed Type Ia supernova at z=1.95, indicating it is consistent with a normal SN Ia subclassification from SNID template matching.\n- Fit phase of observations at 29.0 ± 5.0 d for the G235M spectrum and 37.4 ± 2.8 d for the G140M spectrum, consistent with time-dilation expectations.\n- Composite spectrum of low-z SNe Ia from the kaepora relational database, which shows no significant differences between the local sample and either of the SN Encore spectra.\n- Binned composite spectra based on lightcurve shape, with all except the slowest-evolving subsample matching the observed spectrum.\n- Comparison of the G140M observations of SN Encore with the mean low-z spectrum in the same wavelength region, showing no differences.\n- Analysis of UV spectra of low- and intermediate-z SNe Ia, which previously showed evidence for depressed flux in the local SNe Ia compared to more distant SNe Ia in specific features (2920 Å and 3180 Å).\n- Spectral line velocities of SN Encore, SN H0pe, and iPTF16geu, which are consistent with the distribution observed for low-z SNe Ia.\n- Comparison of SN Encore's spectrum with predictions from different explosion models, including Mch, sub-Mch, DDC, and PDDEL models."
970,2407.16492v1,12,What specific variables or conditions are crucial in the study's findings?,"The study's findings are crucially dependent on two conditions: \n1. The lookback time of the supernova SN Ia, which is more than 10 Gyr.\n2. The lensing magnification caused by the foreground object, which allows for high-quality spectra to be taken a few restframe weeks after maximum light.\n3. The comparison between the observed spectrum and the spectra of low-z SNe Ia, as well as the composite spectra based on the lightcurve shape.\n4. The evaluation of spectral line velocities of SN Encore and other lensed SNe Ia, which are found to be consistent with the distribution observed for low-z SNe Ia.\n5. The comparison with predictions from different explosion models, specifically the M_ch and sub-M_ch models, which are found to underpredict and overpredict certain features in the spectra."
971,2407.16492v1,18,How do the findings relate to existing models or theories?,"The findings indicate that sub-$`M_{\rm Ch}`$ models underpredict the features near $`\sim 6700`$ Å and that DDC and PDDEL models overpredict the complex at $`\sim 5500`$ Å , with DDC models also predicting the strongest feature at $`\sim 9000`$ Å . These comparisons suggest that the observed spectra of SN Encore can be used to distinguish between different model scenarios, but future improvements in the model predictions are needed."


## Export tasks

In [None]:
def get_ranking_task(dataset, doc_id, metadata):
    df_answers = DATASETS[dataset]
    df_filtered = df_answers[df_answers['doc_id'] == doc_id]
    df_filtered = df_filtered.sample(frac=1.0, random_state=2)  # shuffle

    task = {
        **metadata,
        "items": [
            {
                'id': row['cluster_id'],
                'title': row['question'],
                'body': row['reference_answer']
            }
            for row in df_filtered.to_dict(orient='records')
        ]
    }
    return task

In [9]:
selected = [
    ('pubmed-sample', 38221859),
    ('cs-cl', '2410.11996v1'),
    ('astro-ph', '2407.16492v1'),
    ('qmsum-generic', 'Product-train-74'),
]

In [10]:
tasks = []
for dataset, doc_id in selected:
    metadata = {
        'dataset': dataset,
        'doc_id': doc_id,
    }
    task = get_ranking_task(dataset, doc_id, metadata)
    tasks.append(task)

print(len(tasks))

print(json.dumps(tasks, indent=4))

4
[
    {
        "dataset": "pubmed-sample",
        "doc_id": 38221859,
        "items": [
            {
                "id": 11,
                "title": "What was the study design or setting of the trial?",
                "body": "This multicentre, randomized, double-blind, phase 3 study."
            },
            {
                "id": 6,
                "title": "What are the specific biological markers influenced by the intervention?",
                "body": "Glycated haemoglobin (HbA1c), fasting plasma glucose, and postprandial plasma glucose levels."
            },
            {
                "id": 1,
                "title": "What is the main focus of the study?",
                "body": "Assessing the efficacy and safety of prusogliptin (DBPR108) in individuals with type 2 diabetes."
            },
            {
                "id": 29,
                "title": "What is the main outcome or effect observed?",
                "body": "The main outcome observed is the 