In [1]:
import os
import json
from keys import (
    KEY1,
    LOCATION,
    API_VERSION,
    ENDPOINT,
    DEPLOYMENT_GPT_o3_MINI
)
from llm_client import LLMClient

### Helper functions

In [2]:
def clean_list(alist):
    """Remove the ``.DS_Store`` entry from ``alist`` if it is present.

    The list is modified in place (only the first matching entry is
    removed) and the very same list object is returned.
    """
    unwanted = ".DS_Store"
    # Nothing to strip: hand the list back untouched.
    if unwanted not in alist:
        return alist
    alist.remove(unwanted)
    return alist

def save_flaws(flaws_file, flaws_list, flaws_dir):
    """Write ``flaws_list`` as indented UTF-8 JSON to ``flaws_dir/flaws_file``.

    Args:
        flaws_file: Name of the output file inside ``flaws_dir``.
        flaws_list: JSON-serialisable object to store.
        flaws_dir: Destination directory; it is created (including any
            missing parents) when it does not exist yet.

    Raises:
        TypeError: If ``flaws_list`` cannot be serialised by ``json.dump``.
        OSError: If the directory or file cannot be created or written.
    """
    # An empty flaws_dir means "current directory"; os.makedirs("") would raise.
    if flaws_dir:
        os.makedirs(flaws_dir, exist_ok=True)
    file_path = os.path.join(flaws_dir, flaws_file)
    with open(file_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(flaws_list, f, ensure_ascii=False, indent=2)

def load_review_files(reviews_dir):
    """Return the entry names found in ``reviews_dir``, sorted by name.

    The ``.DS_Store`` entry is dropped via ``clean_list``. The result is
    sorted because ``os.listdir`` returns names in arbitrary order, which
    would otherwise make "the first paper" differ between runs.

    Args:
        reviews_dir: Directory to list.

    Returns:
        A new, alphabetically sorted list of entry names.

    Raises:
        OSError: If ``reviews_dir`` cannot be listed.
    """
    paper_reviews = clean_list(os.listdir(reviews_dir))
    return sorted(paper_reviews)

def load_review(reviews_dir, review_file):
    """Read ``reviews_dir/review_file`` as UTF-8 and return the parsed JSON."""
    source_path = os.path.join(reviews_dir, review_file)
    with open(source_path, mode='r', encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed


### Prompt

In [3]:
# Instruction text for the user message. get_flaws_from_llm appends the
# formatted reviews directly after this string, so it ends with blank lines.
prompt = """
You are an expert academic reviewer with extensive experience in synthesizing peer reviews and rebuttals for top-tier machine learning conferences like ICLR. Given multiple peer reviews of an academic paper, along with the corresponding author rebuttals and reviewer replies, your task is to extract a **consensus list of actionable flaws or improvement points**.

Please follow these guidelines:
1. Identify flaws, limitations, or suggestions for improvement raised by reviewers.
2. Prioritize those where **authors acknowledge** the issue — either by:
   - Agreeing with the flaw and planning to address it in future work or a camera-ready version.
   - Acknowledging the issue but providing a reason for not addressing it now (e.g., limited time, compute, or out-of-scope).
3. Ignore points of disagreement or unclear consensus between reviewers and authors.

Your output should be a **python list of actionable items**, each:
- Clear, specific, and concise.
- Based on mutual understanding between reviewers and authors.
- Reflecting consensus, even if the change won't be implemented immediately.


Now analyze the following reviews and rebuttals:\n\n\n
"""


In [4]:


def get_flaws_from_llm(prompt, reviews_list):
    """Send the reviews of one paper to the LLM and return its response.

    Each item of ``reviews_list`` is rendered under a numbered
    "Review N" banner; the rendered items are appended to ``prompt`` and
    sent as the user message, preceded by a fixed system message.

    Returns:
        Whatever ``LLMClient.get_llm_response`` returns for those messages.
    """
    banner_rule = '-' * 50
    review_sections = "".join(
        f"{banner_rule}\nReview {number}\n{banner_rule}\n{review}\n\n"
        for number, review in enumerate(reviews_list, start=1)
    )

    system_message = {
        "role": "system",
        "content": "You are an expert academic reviewer with extensive experience in synthesizing peer reviews and rebuttals for top-tier machine learning conferences like ICLR. Your goal is to extract a consensus list of actionable improvements agreed upon by both reviewers and authors.",
    }
    user_message = {"role": "user", "content": prompt + review_sections}

    # A fresh client is built for every call, configured from the keys module.
    client = LLMClient(
        api_key=KEY1,
        api_version=API_VERSION,
        endpoint=ENDPOINT,
        deployment=DEPLOYMENT_GPT_o3_MINI,
    )
    return client.get_llm_response(messages=[system_message, user_message])
    

### Run Code

In [None]:
def generate_flaws(review_files, reviews_dir, flaws_dir, max_papers=1, save=False):
    """Extract consensus flaws for review files and print (optionally save) them.

    Args:
        review_files: Iterable of review file names located in ``reviews_dir``.
        reviews_dir: Directory the review JSON files are read from.
        flaws_dir: Directory results are written to when ``save`` is true.
        max_papers: Maximum number of files to process. Defaults to 1, i.e. a
            trial run on the first file only; pass ``None`` to process all.
        save: When true, each result is also written with ``save_flaws``
            under the review file's own name. Defaults to False (print only).

    Raises:
        ValueError: If ``max_papers`` is negative.

    Failures on individual files are reported on stdout and do not abort
    the run.
    """
    from itertools import islice

    if max_papers is not None and max_papers < 0:
        raise ValueError(f"max_papers must be >= 0 or None, got {max_papers}")

    # islice(..., None) iterates over everything, so None means "no limit".
    for review_file in islice(review_files, max_papers):
        try:
            review = load_review(reviews_dir, review_file)
            flaws = get_flaws_from_llm(prompt, review)
            if not flaws:
                # Empty/None response from the LLM: report it and move on.
                print(f"[-] Failed to process: {review_file}")
                continue
            print(flaws)
            if save:
                save_flaws(review_file, flaws, flaws_dir)
        except Exception as e:
            # Best-effort batch: say which file failed instead of a bare message.
            print(f"[-] Error while processing {review_file}: {e}")

In [None]:
# List the review file names for each decision category.
# NOTE(review): the names are listed from ICLR2024_Reviews_Raw/... while the
# generate_flaws calls below read from ICLR2024_Reviews_Processed/...;
# presumably both trees contain the same file names — confirm.
accepted_review_files = load_review_files("ICLR2024_Reviews_Raw/accepted")
rejected_review_files = load_review_files("ICLR2024_Reviews_Raw/rejected")

# Uncomment these two lines when everything is ready and you want to run the whole notebook to generate flaws for all papers
# generate_flaws(accepted_review_files, "ICLR2024_Reviews_Processed/accepted", "ICLR2024_Reviews_Flaws/accepted")
# generate_flaws(rejected_review_files, "ICLR2024_Reviews_Processed/rejected", "ICLR2024_Reviews_Flaws/rejected")

In [7]:
# Trial run on the accepted papers: reads from the processed reviews
# directory and prints the extracted flaws.
generate_flaws(accepted_review_files, "ICLR2024_Reviews_Processed/accepted", "ICLR2024_Reviews_Flaws/accepted")

[
    "Reword the opening abstract and introduction to more accurately describe related work and clarify that while there is existing literature on causality (including in dynamic settings), the specific challenges of dynamical causal discovery have not been sufficiently addressed.",
    "Improve the clarity of key definitions and technical language—e.g. explain terms like 'filtered probability space', 'sticky value', and the precise conditions (e.g., smoothness and limit existence) underlying the formal statements—and add appropriate textbook or literature references (including for process‐based causality axioms and counterfactual frameworks).",
    "Revise grammatical phrasing and sentence structure throughout the paper (including examples such as the counterfactual statement on cause and effect) to improve readability and reduce potential ambiguities.",
    "Clarify and update notation inconsistencies, including the explicit dependency of state evolution on actions (e.g. in X(t) and