In [7]:
import copy
import json
import openai
import os
import pandas as pd
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import Dict, List, Literal

In [8]:
## Abstract Summarizer
class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", extra="ignore"
    )
    openai_api_key: str
settings = Settings()

In [None]:
SYSTEM_MESSAGE = '''You will be given the 'Abstract' section text extracted from a paper.
The paper is from a NLP conference. Summarize the text by following the instructions'''

USER_TEMPLATE = '''Given the 'Abstract' text, do the following
- summary: summarize what the paper is about in 1 concise sentence
- points: bullet points of important details from this paper
- topics: list of topics that describe what the paper is about. topic should be one of ["RAG", "retrieval", "information_extraction", "fact_check", "nli", "explanable_ai", "generation", "understanding"]

Title: <|title|>
Abstract:
<|abstract|>

Return in JSON like the following
{
    "summary": "...",
    "points": ["...", ...]
}'''

class SummaryResult(BaseModel):
    summary: str
    points: List[str]
    topics: List[Literal[["RAG", "retrieval", "information_extraction", "fact_check", "nli", "explanable_ai", "generation", "understanding"]]]
    
def fill_template(template: str, contents: Dict[str, str]) -> str:
    message = copy.deepcopy(template)
    for k,v in contents.items():
        message = message.replace(f"<|{k}|>", v)
    return message
    
class Summarizer:
    model: str
    def __init__(self, model = "gpt-4o-mini-2024-07-18"):
        self.client = openai.OpenAI(api_key = settings.openai_api_key)
        self.model = model
    
    def chat(self, messages) -> SummaryResult:
        response = self.client.beta.chat.completions.parse(
            model = self.model,
            messages=messages,
            response_format = SummaryResult,
            temperature = 0.1,
            # max_completion_tokens = 8192
        )
        result_dict = json.loads(response.choices[0].message.content)
        result = SummaryResult(**result_dict)
        return result
    
    def summarize(self, title: str, abstract: str) -> SummaryResult:
        user_message = fill_template(USER_TEMPLATE, {"title": title, "abstract": abstract})
        messages = [
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": user_message}
        ]
        result = self.chat(messages)
        return result

In [29]:
summarizer = Summarizer()

In [30]:
title = "Dense X Retrieval: What Retrieval Granularity Should We Use?"
abstract = "Dense retrieval has become a prominent method to obtain relevant context or world knowledge in open-domain NLP tasks. When we use a learned dense retriever on a retrieval corpus at inference time, an often-overlooked design choice is the retrieval unit in which the corpus is indexed, e.g. document, passage, or sentence. We discover that the retrieval unit choice significantly impacts the performance of both retrieval and downstream tasks. Distinct from the typical approach of using passages or sentences, we introduce a novel retrieval unit, proposition, for dense retrieval. Propositions are defined as atomic expressions within text, each encapsulating a distinct factoid and presented in a concise, self-contained natural language format. We conduct an empirical comparison of different retrieval granularity. Our experiments reveal that indexing a corpus by fine-grained units such as propositions significantly outperforms passage-level units in retrieval tasks. Moreover, constructing prompts with fine-grained retrieved units for retrieval-augmented language models improves the performance of downstream QA tasks given a specific computation budget."
result = summarizer.summarize(title=title, abstract=abstract)

BadRequestError: Error code: 400 - {'error': {'message': "Invalid schema for response_format 'SummaryResult'. Please ensure it is a valid JSON Schema.", 'type': 'invalid_request_error', 'param': 'response_format', 'code': None}}

In [None]:
summary = result.summary
points = result.points
topics = result.topics

print("SUMMARY")
print(summary)
print('-'*30)
print("Points")
for point in points:
    print(f"* {point}")
print('-'*30)
print("Topics:")
print(topics)

SUMMARY
This paper investigates the impact of retrieval granularity on dense retrieval performance in NLP tasks, introducing propositions as a novel retrieval unit that enhances retrieval and downstream task outcomes.
------------------------------
Points
* Dense retrieval is crucial for obtaining relevant context in open-domain NLP tasks.
* The choice of retrieval unit (document, passage, sentence) significantly affects performance.
* The paper introduces 'propositions' as a new retrieval unit, defined as atomic expressions encapsulating distinct factoids.
* Empirical comparisons show that using propositions outperforms traditional passage-level indexing in retrieval tasks.
* Fine-grained retrieval units improve performance in downstream QA tasks when used with retrieval-augmented language models.
------------------------------
Topics:
['retrieval', 'information_extraction', 'understanding']
