## Loading a new rating task is easy! Just follow these steps:

### 1. Create a new config object in the TASK_CONFIGS dictionary of `configs/tasks.py`.


In [7]:
# Register the new task under its own key; fill in every field shown below.
TASK_CONFIGS = {}
TASK_CONFIGS['my_task'] = {
    # Path to your dataset (including system inputs/outputs and human ratings).
    'path': 'TopicalChat/processed_ratings.csv',
    # Name of the property being rated.
    'property': 'Understandable',
    # Number of forced choice options.
    'n_options': 2,
    # Number of response sets (should be 2^n_options - 1).
    'n_response_sets': 3,
    # Indices of the "positive" options, used for downstream metrics like
    # bias and consistency.
    #   Yes/No: [0] means 'Yes' is positive.
    #   A/B/C:  [0, 1] means A and B are positive, C is negative.
    'positive_categorization_options': [0],
    # Fields used to populate the rating prompt from the CSV.
    'prompt_fields': ['context', 'fact', 'response'],
    # Forced choice options returned by the LLM-as-a-Judge.
    'valid_fc_tokens': ['A', 'B'],
    # Response set options returned by the LLM-as-a-Judge.
    'valid_rs_tokens': ['A', 'B', 'AB'],
    # Number of human ratings per item (for display only, hence a string).
    'ratings_per_item': '3',
    # Human readable name of the task for plots.
    'display': 'TopicalChat\nUnderstandable',
}


### 2a. In `configs/tasks.py` → `get_task_forced_choice_distribution()`, add a mapping that converts your human rating columns to rating distributions:


In [11]:
def get_task_forced_choice_distribution(task_name, df):
    """Convert a task's human rating columns into forced-choice probabilities.

    Args:
        task_name: Key of the task whose mapping should be applied.
        df: DataFrame holding the task's rating count columns. For
            'my_task' it must contain `n_positive`, `n_negative` and
            `total_raters`.

    Returns:
        For 'my_task', a DataFrame with one probability column per option
        (`Yes`, `No`).

    Note:
        The probability columns are also written onto `df` itself.
    """
    if task_name == 'my_task':
        # Example for a binary Yes/No task: each option's probability is its
        # rater count divided by the total number of raters for that item.
        n_raters = df['total_raters']
        for option, count_column in (('Yes', 'n_positive'), ('No', 'n_negative')):
            df[option] = df[count_column] / n_raters
        return df[['Yes', 'No']]
    # ... existing code ...

### 2b. In `configs/tasks.py` → `get_task_reverse_fc_translation()`, configure how forced-choice responses map back to response sets for your task:

In [12]:
def get_task_reverse_fc_translation(task_name, n_items, n_options, n_response_sets, beta=1):
    """
    Defines the forced-choice translation matrix F' that maps forced-choice
    responses back to response set distributions. This captures how raters
    resolve rating indeterminacy when forced to select a single option.

    Row i of F' is the distribution over response sets given that the
    forced-choice response was option i.

    Args:
        task_name: Key of the task to build the matrix for. In the code shown
            here only 'my_task' is configured; other names leave the
            template all zeros.
        n_items: Number of items; the template is repeated once per item.
        n_options: Number of forced-choice options (excluding the extra
            "invalid" slot).
        n_response_sets: Number of response sets (excluding the extra
            "invalid" slot).
        beta: P(positive in response set | forced choice = negative).

    Returns:
        Array of shape (n_items, n_options + 1, n_response_sets + 1).
    """
    # +1 on each axis for invalid response handling.
    # LLM refusals and invalid outputs get assigned this "null" option or response set
    F_prime_template = np.zeros((n_options + 1, n_response_sets + 1))

    if task_name == 'my_task':
        # For binary task with options: Yes(0), No(1), Invalid(2)
        # Response sets: {Yes}(0), {No}(1), {Yes,No}(2), Invalid(3)

        F_prime_template[0, 0] = 1         # Yes -> definitely came from {Yes}
        F_prime_template[1, 1] = 1 - beta  # No -> came from {No}
        # A forced-choice "No" can only come from a response set that contains
        # No, so the indeterminate mass goes to {Yes,No} (index 2), not {Yes}.
        F_prime_template[1, 2] = beta      # No -> came from {Yes,No} (indeterminacy)
        F_prime_template[2, 3] = 1         # Invalid responses

    return np.tile(F_prime_template, (n_items, 1, 1))

# The beta parameter is crucial:
# - It represents P(positive ∈ response_set | forced_choice = negative)
# - Set beta based on your task's expected indeterminacy level
# - Can be estimated from paired FC/RS ratings if available

### 3. Define your rating prompts

In [None]:
# Rating prompts for 'my_task', one per elicitation format:
#   'FC' - forced choice: the judge returns exactly one option letter.
#   'RS' - response set: the judge returns every option letter that could apply.
# Each template uses only the placeholders {context}, {fact} and {response},
# which are the task's prompt_fields, so it can be filled from the CSV columns
# alone. The response being rated is shown explicitly so the judge can see it.
PROMPTS['my_task'] = {
    'FC': """Rate whether the response below is understandable.

    Context: {context}
    Fact: {fact}
    Response to rate: {response}

    Select ONE option that best applies:
    A. Yes - The response is understandable
    B. No - The response is not understandable

    RESPONSE FORMAT:
    - Provide only a single letter: A or B
    - No spaces, punctuation, or explanations

    Response: """,

    'RS': """Rate whether the response below is understandable.

    Context: {context}
    Fact: {fact}
    Response to rate: {response}

    Select ALL options that could reasonably apply:
    A. Yes - The response is understandable
    B. No - The response is not understandable

    RESPONSE FORMAT:
    - Provide only the letter sequence (e.g., "A", "B", or "AB")
    - No spaces, punctuation, or explanations

    Response: """
}
# Note: Variables in {} will be replaced with values from columns specified in prompt_fields

### 4. Prepare your data file

Save your data as a CSV in the `datasets/` directory with the following structure:

#### Required columns

| Column Type | Description | Example |
|------------|-------------|---------|
| **Prompt fields** | Must match `prompt_fields` in config | `context`, `fact`, `response` |
| **Rating columns** | Human annotation counts | `n_positive`, `n_negative`, `total_raters` |

#### Requirements
- **Minimum 3 human ratings per item** for estimating forced choice distribution
- **Column names must exactly match** those specified in `prompt_fields`
- **Save location:** `datasets/your_task_name.csv`

#### Example CSV structure
| context | fact | response | n_positive | n_negative | total_raters |
|---------|------|----------|------------|------------|--------------|
| User question | Model claim | Model response | 7 | 3 | 10 |
| What's the weather? | It will rain tomorrow | Based on the forecast, yes | 8 | 2 | 10 |
| How do I cook pasta? | Boil water first | Start by bringing a large pot of salted water to a boil | 9 | 1 | 10 |

### 5. Test your configuration

After completing steps 1–4 above (task config, rating mappings, prompts, and data file), test your configuration:

In [None]:
# Smoke test for a newly added task: load the data, render one prompt, derive
# the human rating distribution, and do a mock model run.

# 5.1 Verify data loads correctly
task_name = 'my_task'
task_config = tasks.TASK_CONFIGS[task_name]
corpus = loader.load_data(task_name, task_config, use_hf=False)
print(f"✓ Loaded {len(corpus)} items")
print("\nFirst few items:")
print(corpus.head())

# 5.2 Check prompt formatting works
sample_item = corpus.iloc[0]
# Fill the template from the CSV columns named in prompt_fields.
prompt_values = {field: sample_item[field] for field in task_config['prompt_fields']}
# A template may also reference {property}, which lives in the task config
# rather than the CSV; supplying it avoids a KeyError. str.format ignores
# keys the template does not use.
prompt_values.setdefault('property', task_config['property'])
fc_prompt = prompts.PROMPTS[task_name]['FC'].format(**prompt_values)
print("\n✓ Sample FC prompt (first 300 chars):")
print(fc_prompt[:300])

# 5.3 Verify human rating distribution
fc_dist = tasks.get_task_forced_choice_distribution(task_name, corpus)
print(f"\n✓ Human rating distribution shape: {fc_dist.shape}")
print("Sample distribution for first item:")
print(fc_dist.iloc[0])

# 5.4 Test with mock API calls for quick validation
client = ModelAPIClient(
    task_configs={task_name: task_config},
    models=[models.MODELS[3]],  # Just test with one model
)

run_tag = f'test_{task_name}'
client.run_tasks(run_tag=run_tag, mock=True)
print(f"\n✓ Mock run completed for {task_name}")

# If all checks pass, your task is ready to use!