In [1]:
### Imports
import pandas as pd
import numpy as np

## Data Analysis

This notebook analyzes evaluation results comparing AI-generated chart interpretations under two conditions:

1. **Image only**: o4-mini receives the chart without any context
2. **Image + context**: o4-mini receives the chart together with its surrounding textual context

Each interpretation was evaluated on four quality dimensions using 7-point Likert scales:
- Accuracy
- Clarity  
- Completeness
- Relevance

In addition, evaluators recorded an overall preference between the two interpretations (A, B, or Equal).

### Research Questions

1. Do context-enhanced interpretations score higher on quality dimensions?
2. Which condition do evaluators prefer overall?
3. How do the quality dimensions correlate with each other?

###  1. Dataset Loading & Preparation

In [None]:
# Load the evaluation results from the CSV file and display the frame
df = pd.read_csv(filepath_or_buffer='data/evaluation_results.csv')
df

Unnamed: 0,item_index,preference,text_a_has_context,text_b_has_context,with_context_accuracy,with_context_clarity,with_context_relevance,with_context_completeness,without_context_accuracy,without_context_clarity,without_context_relevance,without_context_completeness,preference_actual
0,0,A,True,False,6,6,7,6,6,7,7,5,with_context
1,1,B,True,False,5,6,6,6,7,7,7,7,without_context
2,2,Equal,True,False,6,4,5,5,5,5,7,5,equal
3,3,B,True,False,4,5,5,6,4,5,6,6,without_context
4,4,A,True,False,5,6,7,6,4,5,6,6,with_context
...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,56,B,False,True,6,7,7,7,5,6,6,7,with_context
57,57,Equal,False,True,6,6,7,7,6,7,7,6,equal
58,58,A,True,False,5,7,7,7,4,6,5,5,with_context
59,59,B,False,True,7,7,7,6,6,6,7,7,with_context


#### 1.1 Converting the dataset into long format


In [None]:
metrics = ["accuracy", "clarity", "relevance", "completeness"]
conditions = ["with_context", "without_context"]


def _condition_record(item, condition):
    """Build one long-format record: the ratings `item` received under `condition`.

    `item` is a wide-format row holding one `<condition>_<metric>` column per
    (condition, metric) pair, plus `item_index` and `preference_actual`.
    """
    ratings = {metric: item[f"{condition}_{metric}"] for metric in metrics}
    return {
        "item_index": item.item_index,
        "condition": condition,
        **ratings,
        # Unweighted mean of the metric ratings for this condition
        "overall": sum(ratings.values()) / len(metrics),
        "preference": item.preference_actual,
    }


# Each wide row (one per item) yields one long row per condition
long_rows = [
    _condition_record(item, condition)
    for _, item in df.iterrows()
    for condition in conditions
]

# Assemble the long-format DataFrame and display it
long_df = pd.DataFrame(long_rows)
long_df

Unnamed: 0,item_index,condition,accuracy,clarity,relevance,completeness,overall,preference
0,0,1,6,6,7,6,6.25,1
1,0,0,6,7,7,5,6.25,1
2,1,1,5,6,6,6,5.75,0
3,1,0,7,7,7,7,7.00,0
4,2,1,6,4,5,5,5.00,-1
...,...,...,...,...,...,...,...,...
117,58,0,4,6,5,5,5.00,1
118,59,1,7,7,7,6,6.75,1
119,59,0,6,6,7,7,6.50,1
120,60,1,7,7,7,7,7.00,1


#### 1.2 Factorizing categorical variables

In [15]:
# Encoding the condition & preference columns as integer codes.
## NB: both columns share the codes with_context=1 / without_context=0, so
## `preference` can be compared directly against `condition`.
condition_codes = {"with_context": 1, "without_context": 0}
preference_codes = {"with_context": 1, "without_context": 0, "equal": -1}


def _encode_labels(labels, codes):
    """Map the string labels in `labels` to their integer codes.

    Every code also maps to itself, so running this cell again on an
    already-encoded column is a no-op. A plain ``labels.map(codes)`` is not
    re-runnable: on the second run the integer values are not keys of
    ``codes`` and the whole column silently becomes NaN.

    Labels that are neither a key nor a code of `codes` still map to NaN.
    """
    lookup = {**codes, **{code: code for code in codes.values()}}
    return labels.map(lookup)


long_df['condition'] = _encode_labels(long_df['condition'], condition_codes)
long_df['preference'] = _encode_labels(long_df['preference'], preference_codes)

### 2. Addressing Research Questions