# Sample

In [2]:
import json
import os
import pandas as pd
from collections import defaultdict
import re

def load_json_file(filepath):
    """Load and parse a single JSON file, returning None on failure.

    Args:
        filepath: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed JSON value, or None when the file cannot be opened,
        decoded, or parsed. A warning is printed in the failure case.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    # OSError (rather than only FileNotFoundError) also covers permission
    # errors and directories whose name ends in ".json", so one unreadable
    # entry cannot abort a whole directory scan.
    except (OSError, json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Warning: Could not read or parse {filepath}: {e}")
        return None

def collect_metareview_data(metareviews_path, venue_folder_name):
    """
    Collects 'is_flaw_mentioned' and 'mention_reasoning' from metareview JSON files.

    Files are read from
    ``<metareviews_path>/<model>/<venue_folder_name>/<accepted|rejected>/*.json``.
    Each file is expected to hold a JSON object that maps a paper key to a
    list of flaw objects; the text before the first underscore of the paper
    key is used as the OpenReview id. Files or entries that do not have this
    shape are skipped instead of raising.

    Args:
        metareviews_path: Root directory containing one folder per model.
        venue_folder_name: Name of the venue folder inside each model folder.

    Returns:
        dict mapping ``(openreview_id, flaw_id)`` to a dict with the keys
        'is_flaw_mentioned' and 'mention_reasoning'. Empty when the root
        directory does not exist.
    """
    mention_data = {}
    print(f"Scanning for metareview data in: {metareviews_path}")

    if not os.path.exists(metareviews_path):
        print(f"Warning: Metareviews directory not found at {metareviews_path}")
        return mention_data

    # Only the 'o3' model is scanned; iterate os.listdir(metareviews_path)
    # here instead to cover every model folder.
    for model_name in ['o3']:
        venue_path = os.path.join(metareviews_path, model_name, venue_folder_name)
        for status in ['accepted', 'rejected']:
            status_path = os.path.join(venue_path, status)
            # A missing model or venue folder also makes this check fail.
            if not os.path.isdir(status_path):
                continue
            # Sorted so that, when the same key appears in several files,
            # the surviving entry does not depend on directory order.
            for filename in sorted(os.listdir(status_path)):
                if not filename.endswith(".json"):
                    continue
                file_path = os.path.join(status_path, filename)
                data = load_json_file(file_path)
                if not data:
                    continue
                if not isinstance(data, dict):
                    print(f"Warning: Unexpected JSON structure in {file_path}; skipping.")
                    continue
                for paper_key, flaws in data.items():
                    openreview_id = paper_key.split('_')[0]
                    if not openreview_id or not isinstance(flaws, list):
                        continue
                    for flaw in flaws:
                        if not isinstance(flaw, dict):
                            continue
                        flaw_id = flaw.get('flaw_id')
                        if not flaw_id:
                            continue
                        mention_data[(openreview_id, flaw_id)] = {
                            'is_flaw_mentioned': flaw.get('is_flaw_mentioned'),
                            'mention_reasoning': flaw.get('mention_reasoning'),
                        }
    print(f"Collected mention data for {len(mention_data)} flaws.")
    return mention_data

def collect_llm_review_data(reviews_path, venue_folder_name):
    """
    Gathers the serialized LLM review for every (paper, flaw) pair found on disk.

    Review files are looked up under
    ``<reviews_path>/<model>/<venue_folder_name>/<accepted|rejected>/<paper_folder>/``
    and must end in ``_review.json``. The OpenReview id is the part of the
    paper folder name before the first underscore; the flaw id is parsed
    from the review file name.

    Returns a dict mapping ``(openreview_id, flaw_id)`` to
    ``{'llm_review': <pretty-printed JSON string>}``.
    """
    print(f"Scanning for LLM review data in: {reviews_path}")
    collected = {}

    if not os.path.exists(reviews_path):
        print(f"Warning: Reviews directory not found at {reviews_path}")
        return collected

    for model_name in ['o3']:  # os.listdir(reviews_path) would cover all models
        venue_dir = os.path.join(reviews_path, model_name, venue_folder_name)
        if not os.path.isdir(venue_dir):
            continue

        for status in ['accepted', 'rejected']:
            status_dir = os.path.join(venue_dir, status)
            if not os.path.isdir(status_dir):
                continue

            for paper_folder in os.listdir(status_dir):
                paper_dir = os.path.join(status_dir, paper_folder)
                if not os.path.isdir(paper_dir):
                    continue

                paper_id = paper_folder.split('_')[0]
                for review_file in os.listdir(paper_dir):
                    if not review_file.endswith("_review.json"):
                        continue

                    # Expected name: <id>_<digits>_<digits>_<flaw_id>_review.json
                    # e.g. 0aN7VWwp4g_2410_23159_incorrect_csi_thresholds_review.json
                    #      -> incorrect_csi_thresholds
                    parsed = re.match(r'(.+?)_(\d+_\d+)_(.+)_review\.json', review_file)
                    if parsed:
                        flaw_id = parsed.group(3)
                    else:
                        # Other naming: treat everything after the third
                        # underscore-separated field as the flaw id.
                        stem = review_file.replace('_review.json', '')
                        flaw_id = '_'.join(stem.split('_')[3:])

                    if not flaw_id:
                        continue

                    payload = load_json_file(os.path.join(paper_dir, review_file))
                    if payload is None:
                        continue
                    collected[(paper_id, flaw_id)] = {
                        'llm_review': json.dumps(payload, indent=2)
                    }

    print(f"Collected LLM reviews for {len(collected)} flaws.")
    return collected

def _keyed_records_to_frame(records, value_columns):
    """Flatten {(openreview_id, flaw_id): {...}} into a DataFrame with named key columns."""
    columns = ['openreview_id', 'flaw_id', *value_columns]
    rows = [
        {'openreview_id': openreview_id, 'flaw_id': flaw_id, **values}
        for (openreview_id, flaw_id), values in records.items()
    ]
    # Passing `columns` keeps the key columns present even when `records` is
    # empty, so the later merge on them cannot fail with a KeyError.
    return pd.DataFrame(rows, columns=columns)


def create_aggregated_dataset(venue_folder_name, base_data_dir, categorized_data_dir, output_filename):
    """
    Main function to orchestrate the data aggregation process.

    Starts from the categorized-flaw CSV, left-joins the flaw descriptions,
    the metareview mention data and the LLM review text on
    ('openreview_id', 'flaw_id'), and writes the result to a CSV file.

    Args:
        venue_folder_name: Venue folder name used inside every data source.
        base_data_dir: Directory holding 'flawed_papers', 'metareviews' and
            'reviews'.
        categorized_data_dir: Directory holding
            'flawed_papers/<venue>/categorized_flaw_cleaned.csv'.
        output_filename: Path of the CSV file to write.

    Returns:
        None. Nothing is written when the categorized-flaw CSV is missing.
    """
    # Define paths to the different data sources
    flawed_papers_dir = os.path.join(base_data_dir, 'flawed_papers', venue_folder_name)
    metareviews_dir = os.path.join(base_data_dir, 'metareviews')
    reviews_dir = os.path.join(base_data_dir, 'reviews')

    # --- 1. Load Base Data: Categories and Descriptions ---
    print("Step 1: Loading base data...")
    # Load categorized flaws (openreview_id, flaw_id, category_ids)
    categories_path = os.path.join(categorized_data_dir, 'flawed_papers', venue_folder_name, 'categorized_flaw_cleaned.csv')
    try:
        categories_df = pd.read_csv(categories_path)
    except FileNotFoundError:
        print(f"Error: Base category file not found at {categories_path}. Cannot proceed.")
        return

    # Load flaw descriptions; a missing file only leaves the column empty.
    descriptions_path = os.path.join(flawed_papers_dir, 'flawed_papers_global_summary.csv')
    try:
        descriptions_df = pd.read_csv(descriptions_path)[['openreview_id', 'flaw_id', 'flaw_description']]
    except FileNotFoundError:
        print(f"Warning: Flaw description file not found at {descriptions_path}.")
        descriptions_df = pd.DataFrame(columns=['openreview_id', 'flaw_id', 'flaw_description'])

    # Merge categories and descriptions
    df = pd.merge(categories_df, descriptions_df, on=['openreview_id', 'flaw_id'], how='left')
    print(f"Loaded and merged base data. Shape: {df.shape}")

    # --- 2. Collect and Merge Metareview Data ---
    print("\nStep 2: Collecting metareview data...")
    mention_data = collect_metareview_data(metareviews_dir, venue_folder_name)
    mention_df = _keyed_records_to_frame(mention_data, ['is_flaw_mentioned', 'mention_reasoning'])
    df = pd.merge(df, mention_df, on=['openreview_id', 'flaw_id'], how='left')
    print(f"Data shape after merging mention data: {df.shape}")

    # --- 3. Collect and Merge LLM Review Data ---
    print("\nStep 3: Collecting LLM review data...")
    review_data = collect_llm_review_data(reviews_dir, venue_folder_name)
    review_df = _keyed_records_to_frame(review_data, ['llm_review'])
    df = pd.merge(df, review_df, on=['openreview_id', 'flaw_id'], how='left')
    print(f"Data shape after merging LLM review data: {df.shape}")

    # --- 4. Finalize and Save ---
    print("\nStep 4: Finalizing and saving the dataset...")
    # Ensure all required columns are present
    final_columns = [
        'openreview_id', 'flaw_id', 'category_ids', 'flaw_description',
        'llm_review', 'is_flaw_mentioned', 'mention_reasoning'
    ]
    for col in final_columns:
        if col not in df.columns:
            df[col] = None

    # Reorder columns and save to CSV
    final_df = df[final_columns]
    final_df.to_csv(output_filename, index=False)
    print(f"\nSuccessfully created aggregated dataset with {len(final_df)} rows.")
    print(f"File saved to: {output_filename}")


if __name__ == '__main__':
    # --- Configuration ---
    # Venue to aggregate and the directories its inputs are read from
    VENUE_FOLDER_NAME = 'NeurIPS2024_latest_flawed_papers_v1'
    BASE_DATA_DIRECTORY = '../data'  # Adjusted to match provided folder structure
    CATEGORIZED_DATA_DIRECTORY = './extracted_data'

    # Destination CSV for the aggregated dataset
    OUTPUT_FILENAME = 'neurips_2024_aggregated_flaws.csv'

    # Build and save the aggregated dataset
    create_aggregated_dataset(
        venue_folder_name=VENUE_FOLDER_NAME,
        base_data_dir=BASE_DATA_DIRECTORY,
        categorized_data_dir=CATEGORIZED_DATA_DIRECTORY,
        output_filename=OUTPUT_FILENAME,
    )



Step 1: Loading base data...
Loaded and merged base data. Shape: (2136, 4)

Step 2: Collecting metareview data...
Scanning for metareview data in: ../data/metareviews
Collected mention data for 2136 flaws.
Data shape after merging mention data: (2136, 6)

Step 3: Collecting LLM review data...
Scanning for LLM review data in: ../data/reviews
Collected LLM reviews for 2136 flaws.
Data shape after merging LLM review data: (2136, 7)

Step 4: Finalizing and saving the dataset...

Successfully created aggregated dataset with 2136 rows.
File saved to: neurips_2024_aggregated_flaws.csv


In [3]:
# Read the CSV at OUTPUT_FILENAME back into a DataFrame and preview its first rows.
df = pd.read_csv(OUTPUT_FILENAME)
df.head()

Unnamed: 0,openreview_id,flaw_id,category_ids,flaw_description,llm_review,is_flaw_mentioned,mention_reasoning
0,wSqpNeMVLU,missing_real_world_batch_experiments,"""1b,3b""","Reviewers qBZ2, GGsr, and the program chairs a...","{\n ""summary"": ""The paper develops a unified ...",True,"The review states: ""Empirical validation is mi..."
1,bioHNTRnQk,kernel_regression_claim,"""3b""",The paper repeatedly states that its theoretic...,"{\n ""summary"": ""The paper develops a theoreti...",False,The review treats the kernel ridge regression ...
2,7W0f7lifDk,low_output_resolution,"""2a""",The method currently operates at only 256×256 ...,"{\n ""summary"": ""The paper introduces Human-3D...",False,The review never refers to the 256×256 output ...
3,UahrHR5HQh,missing_comparison_dirichlet_flow,"""1a""",Reviewer vH9B identified the absence of an exp...,"{\n ""summary"": ""The paper reframes Flow Match...",True,"The review states: ""Baseline coverage is incom..."
4,CW0OVWEKKu,lack_of_rigorous_theory,"""2b""",Reviewers highlighted the absence of a formal ...,"{\n ""summary"": ""The paper revisits the proble...",True,"The review explicitly states: ""**Heuristic, no..."


In [4]:
# Keep only the two identifier columns and the flaw description, then preview.
df = df[['openreview_id', 'flaw_id', 'flaw_description']]
df.head()

Unnamed: 0,openreview_id,flaw_id,flaw_description
0,wSqpNeMVLU,missing_real_world_batch_experiments,"Reviewers qBZ2, GGsr, and the program chairs a..."
1,bioHNTRnQk,kernel_regression_claim,The paper repeatedly states that its theoretic...
2,7W0f7lifDk,low_output_resolution,The method currently operates at only 256×256 ...
3,UahrHR5HQh,missing_comparison_dirichlet_flow,Reviewer vH9B identified the absence of an exp...
4,CW0OVWEKKu,lack_of_rigorous_theory,Reviewers highlighted the absence of a formal ...


In [5]:
# Save the trimmed table as "<OUTPUT_FILENAME without extension>_shortened.csv".
# os.path.splitext removes only the final extension, so a dot elsewhere in the
# path (e.g. "../data/out.csv") no longer truncates the file name.
df.to_csv(f"{os.path.splitext(OUTPUT_FILENAME)[0]}_shortened.csv", index=False)

In [59]:
import openreview
import os
import pandas as pd
import re
from dotenv import load_dotenv
from tqdm import tqdm
import json

# Load environment variables from a .env file
load_dotenv()

# --- Configuration for Reviewer ID Extraction ---

# A blocklist of common 4-letter words, acronyms, and other false positives.
# This helps prevent matching common English words or technical terms.
# Entries are lowercase: extract_reviewers_from_flaws lowercases each candidate
# token before testing membership, so the match is case-insensitive.
REVIEWER_ID_BLOCKLIST = {
    'thus', 'flow', 'they', 'this', 'self', 'both', 'relu', 'gnns', 'llms',
    'snip', 'elbo', 'geom', 'grad', 'mesa', '2sls', 'ivar', 'wdcf', 'from',
    'with', 'that', 'what', 'when', 'were', 'have', 'been', 'also', 'some'
}


def extract_reviewers_from_flaws(input_csv_path: str, output_csv_path: str):
    """
    Scans flaw descriptions for reviewer-ID-like tokens and saves the matches.

    Every standalone 4-character alphanumeric token in 'flaw_description' is
    a candidate. Purely numeric tokens and tokens found (case-insensitively)
    in REVIEWER_ID_BLOCKLIST are dropped. Rows that keep at least one token
    are written with the tokens de-duplicated, sorted and comma-joined in a
    'reviewers_ids' column; all other rows are omitted.

    Args:
        input_csv_path (str): Path to the input CSV file.
        output_csv_path (str): Path to save the intermediate output CSV file.
    """
    print(f"Reading flaws from {input_csv_path}...")
    try:
        flaws_df = pd.read_csv(input_csv_path)
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_csv_path}")
        return

    # Candidate tokens: exactly four ASCII letters/digits between word boundaries.
    token_re = re.compile(r'\b[a-zA-Z0-9]{4}\b')
    output_columns = ['openreview_id', 'flaw_id', 'reviewers_ids']

    print("Extracting and filtering reviewer IDs from flaw descriptions...")
    rows_with_ids = []
    for _, row in tqdm(flaws_df.iterrows(), total=flaws_df.shape[0], desc="Scanning descriptions"):
        text = row.get('flaw_description', '')
        text = '' if pd.isna(text) else str(text)

        # Drop pure numbers (e.g. '2024') and blocklisted words in one pass;
        # the set removes duplicates.
        kept_tokens = {
            token
            for token in token_re.findall(text)
            if not token.isdigit() and token.lower() not in REVIEWER_ID_BLOCKLIST
        }
        if not kept_tokens:
            continue

        rows_with_ids.append({
            'openreview_id': row['openreview_id'],
            'flaw_id': row['flaw_id'],
            'reviewers_ids': ','.join(sorted(kept_tokens)),
        })

    if not rows_with_ids:
        print("Warning: No valid reviewer IDs were found in any of the descriptions. The output file will be empty.")
        # Header-only file so downstream readers still find the expected columns.
        pd.DataFrame(columns=output_columns).to_csv(output_csv_path, index=False)
        return

    result_df = pd.DataFrame(rows_with_ids)
    result_df.to_csv(output_csv_path, index=False)
    print(f"Successfully generated intermediate CSV with {len(result_df)} entries at: {output_csv_path}")

def get_openreview_client():
    """
    Builds an OpenReview v2 client from credentials in the environment.

    Reads OPENREVIEW_USERNAME and OPENREVIEW_PASSWORD from the environment.

    Returns:
        The connected client, or None when creating it fails (the error is
        printed).

    Raises:
        ValueError: If either credential is unset or empty.
    """
    credentials = {
        'username': os.environ.get('OPENREVIEW_USERNAME'),
        'password': os.environ.get('OPENREVIEW_PASSWORD'),
    }
    if not all(credentials.values()):
        raise ValueError("OPENREVIEW_USERNAME and OPENREVIEW_PASSWORD environment variables must be set in a .env file.")

    try:
        return openreview.api.OpenReviewClient(
            baseurl='https://api2.openreview.net',
            **credentials,
        )
    except Exception as e:
        print(f"Failed to connect to OpenReview: {e}")
    return None

def _format_review_content(content):
    """Render the non-empty string fields of a reply's content as '## Title' sections."""
    sections = []
    for key, value_obj in content.items():
        # Fields are expected in the {'value': ...} wrapper form; others are ignored.
        if isinstance(value_obj, dict) and 'value' in value_obj:
            actual_value = value_obj.get('value')
            if isinstance(actual_value, str) and actual_value.strip():
                sections.append(f"## {key.replace('_', ' ').title()}\n{actual_value.strip()}")
    return "\n\n".join(sections)


def _build_review_map(note_replies):
    """Map each signature's suffix (text after the last '_') to that reply's formatted review."""
    review_map = {}
    for reply in note_replies:
        if not reply or 'signatures' not in reply:
            continue
        content = reply.get('content')
        # A reply without dict content cannot be a review; skip it rather
        # than letting it fail the whole paper.
        if not isinstance(content, dict):
            continue
        # Official reviews are identified by having content like 'rating', 'strengths', etc.
        if not any(key in content for key in ('review', 'rating', 'strengths', 'weaknesses', 'summary')):
            continue
        review_text = _format_review_content(content)
        for sig in reply['signatures']:
            # Assumes signature format like '.../Reviewer_mZ7h'
            review_map[sig.split('_')[-1]] = review_text
    return review_map


def fetch_reviews_and_generate_final_csv(intermediate_csv_path: str, output_csv_path: str):
    """
    Reads the intermediate CSV, fetches review content for each paper,
    and generates the final CSV with the detailed reviews.

    Each paper's replies are fetched once and reused for every flaw row of
    that paper. A failed fetch is not cached, so a later row for the same
    paper tries again. Rows whose reviewer IDs match no reply signature
    produce no output.

    Args:
        intermediate_csv_path (str): Path to the CSV with reviewer IDs.
        output_csv_path (str): Path to save the final output CSV file.
    """
    output_columns = ['openreview_id', 'flaw_id', 'reviewer_id', 'human_review']

    client = get_openreview_client()
    if not client:
        return

    try:
        df = pd.read_csv(intermediate_csv_path)
        df = df.dropna(subset=['reviewers_ids'])
        df = df[df['reviewers_ids'] != '']
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"Intermediate file not found or is empty at {intermediate_csv_path}. Cannot fetch reviews.")
        pd.DataFrame(columns=output_columns).to_csv(output_csv_path, index=False)
        return

    final_results = []
    # openreview_id -> review map, or None when the note has no replies.
    review_maps = {}
    print(f"Fetching reviews for {len(df)} entries with identified reviewers...")
    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing papers"):
        openreview_id = row['openreview_id']
        flaw_id = row['flaw_id']
        reviewer_ids = str(row['reviewers_ids']).split(',')

        try:
            if openreview_id not in review_maps:
                # --- Per-Paper API Call (first row of each paper only) ---
                note = client.get_note(openreview_id, details='replies')
                note_replies = note.details.get('replies', [])
                review_maps[openreview_id] = _build_review_map(note_replies) if note_replies else None
            review_map = review_maps[openreview_id]
        except Exception as e:
            # Best effort: one failing paper must not stop the whole run.
            tqdm.write(f"An unexpected error occurred while processing {openreview_id}: {e}. Skipping.")
            continue

        if review_map is None:
            tqdm.write(f"Note for '{openreview_id}' found, but it has no replies/reviews. Skipping.")
            continue

        for target_id in reviewer_ids:
            if target_id in review_map:
                final_results.append({
                    'openreview_id': openreview_id,
                    'flaw_id': flaw_id,
                    'reviewer_id': target_id,
                    'human_review': review_map[target_id]
                })

    if not final_results:
        print("No review data was successfully matched. The final CSV will be empty.")
        pd.DataFrame(columns=output_columns).to_csv(output_csv_path, index=False)
    else:
        output_df = pd.DataFrame(final_results)
        output_df.to_csv(output_csv_path, index=False)
        print(f"Successfully generated final reviews CSV at: {output_csv_path}")

def run_review_processing(input_csv_path: str, output_dir: str = "output"):
    """
    Runs the full pipeline: reviewer-ID extraction followed by review retrieval.

    The output directory is created if needed. Step 1 writes
    'flaws_with_reviewers.csv' there and step 2 reads it to produce
    'reviews_data.csv'.

    Args:
        input_csv_path (str): Path to the input CSV file containing flaw data.
        output_dir (str, optional): Directory to save the output files. Defaults to "output".
    """
    os.makedirs(output_dir, exist_ok=True)
    reviewer_ids_csv, reviews_csv = (
        os.path.join(output_dir, name)
        for name in ('flaws_with_reviewers.csv', 'reviews_data.csv')
    )

    print("--- Step 1: Extracting Reviewer IDs ---")
    extract_reviewers_from_flaws(input_csv_path, reviewer_ids_csv)

    print("\n--- Step 2: Fetching Full Reviews from API (Iterative) ---")
    fetch_reviews_and_generate_final_csv(reviewer_ids_csv, reviews_csv)

    print("\nProcessing complete.")

if __name__ == "__main__":
    # --- Configuration ---
    INPUT_FILE = "sample_input.csv"
    OUTPUT_DIR = "output"
    # -------------------

    print(f"Running script with settings: Input='{INPUT_FILE}'...")

    # Only start the pipeline when the input CSV is actually present.
    if os.path.exists(INPUT_FILE):
        run_review_processing(input_csv_path=INPUT_FILE, output_dir=OUTPUT_DIR)
    else:
        print(f"\nERROR: The input file '{INPUT_FILE}' was not found.")
        print("Please create it or update the INPUT_FILE variable in the script.")



Running script with settings: Input='sample_input.csv'...
--- Step 1: Extracting Reviewer IDs ---
Reading flaws from sample_input.csv...
Extracting and filtering reviewer IDs from flaw descriptions...


Scanning descriptions: 100%|██████████| 2136/2136 [00:00<00:00, 23957.13it/s]

Successfully generated intermediate CSV with 2066 entries at: output/flaws_with_reviewers.csv

--- Step 2: Fetching Full Reviews from API (Iterative) ---





Fetching reviews for 2066 entries with identified reviewers...


Processing papers: 100%|██████████| 2066/2066 [07:36<00:00,  4.52it/s] 


Successfully generated final reviews CSV at: output/reviews_data.csv

Processing complete.
