In [1]:
import os
import json
import pandas as pd
import pickle
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from dotenv import load_dotenv
import ast
import re

In [2]:
# Set up paths
# A notebook has no `__file__`, so every path is anchored at the directory the
# kernel is running in.
BASE_DIR = os.getcwd()
DATASET_DIR = os.path.join(BASE_DIR, "2025_dataset", "train")
IMAGES_DIR = os.path.join(DATASET_DIR, "images_train")

In [3]:
# Load question definitions
# The JSON files are opened as UTF-8 explicitly so that parsing does not depend
# on the platform's default locale encoding.
questions_path = os.path.join(DATASET_DIR, "closedquestions_definitions_imageclef2025.json")
with open(questions_path, 'r', encoding='utf-8') as f:
    questions = json.load(f)

# Convert to DataFrame for easier manipulation (English fields only)
questions_df = pd.json_normalize(questions)[["qid", "question_en", "options_en", "question_type_en", "question_category_en"]]

# Load train data with query information
train_json_path = os.path.join(DATASET_DIR, "train.json")
train_df = pd.read_json(train_json_path)

# Extract relevant columns including query content and title
query_info_df = train_df[["encounter_id", "image_ids", "query_title_en", "query_content_en", "author_id"]]

# Load CVQA data (ground truth answers)
cvqa_path = os.path.join(DATASET_DIR, "train_cvqa.json")
with open(cvqa_path, 'r', encoding='utf-8') as f:
    cvqa_data = json.load(f)
cvqa_df = pd.json_normalize(cvqa_data)

# Melt to get one row per (encounter, question). "encounter_id" is the id
# variable, so it never shows up as a value of the "qid" column and no
# follow-up filtering on "qid" is needed.
cvqa_long = cvqa_df.melt(id_vars=["encounter_id"],
                         var_name="qid",
                         value_name="answer_index")

# Merge CVQA with questions (left join keeps every answer row)
cvqa_merged = cvqa_long.merge(questions_df, on="qid", how="left")

# Get answer text
def get_answer_text(row):
    """
    Return the option text selected by a row's answer index.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            "options_en" (sequence of option strings) and "answer_index".

    Returns:
        The option at position "answer_index", or None when the index is
        missing (None/NaN), not a whole number, negative, out of range, or
        when "options_en" is not subscriptable.
    """
    options = row["options_en"]
    index = row["answer_index"]
    try:
        # A numeric column that contains missing values is stored as float,
        # so a valid index may arrive as 1.0; lists reject float subscripts.
        position = int(index)
        # Reject fractional values (1.5) and non-numeric look-alikes ("1"),
        # and do not let negative indices wrap around to the end of the list.
        if position != index or position < 0:
            return None
        return options[position]
    except (IndexError, TypeError, ValueError, OverflowError):
        # int(NaN) -> ValueError, int(inf) -> OverflowError,
        # int(None) / non-subscriptable options -> TypeError
        return None

# Resolve each row's answer index to the corresponding option text
cvqa_merged["answer_text"] = cvqa_merged.apply(func=get_answer_text, axis="columns")

# Attach the per-encounter query metadata; the left join keeps every answer row
final_df = pd.merge(cvqa_merged, query_info_df, how="left", on="encounter_id")

In [4]:
# Preview the merged frame: one row per (encounter, question slot) with the
# answer text and the encounter's query metadata attached.
final_df.head()

Unnamed: 0,encounter_id,qid,answer_index,question_en,options_en,question_type_en,question_category_en,answer_text,image_ids,query_title_en,query_content_en,author_id
0,ENC00001,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",Site,General,limited area,"[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]",Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473
1,ENC00002,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",Site,General,limited area,"[IMG_ENC00002_00001.jpg, IMG_ENC00002_00002.jp...",What is on the bottom of the right foot?,"The patient is a 50-year-old male, who has bee...",U06063
2,ENC00003,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",Site,General,limited area,"[IMG_ENC00003_00001.jpg, IMG_ENC00003_00002.jp...",Interpreting Images - Is it magical skin?,"Male, 65 years old, skin lesions as shown in t...",U00780
3,ENC00004,CQID010-001,2,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",Site,General,widespread,"[IMG_ENC00004_00001.jpg, IMG_ENC00004_00002.jpg]",Skin Disease,"Male, 15 years old, keratosis on both palms, s...",U00209
4,ENC00005,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",Site,General,limited area,[IMG_ENC00005_00001.jpg],Perifollicular atrophy?,"Young female, silver-gray dot-like atrophy spo...",U09050


In [5]:
# Extract the base CQID code (the "CQID<digits>" prefix of each qid).
# expand=False yields a Series instead of a one-column DataFrame.
final_df['base_qid'] = final_df['qid'].str.extract(r'(CQID\d+)', expand=False)

# Group by encounter_id and base_qid to see all answers for each question family.
# Per-slot fields are collected into lists; per-family / per-encounter fields
# are taken from the first row of the group.
grouped_by_family = (
    final_df
    .groupby(['encounter_id', 'base_qid'])
    .agg({
        'qid': list,
        'question_en': list,
        'answer_text': list,
        'answer_index': list,
        'image_ids': 'first',
        'options_en': 'first',
        'question_type_en': 'first',
        'question_category_en': 'first',
        'query_title_en': 'first',
        'query_content_en': 'first',
        'author_id': 'first'
    })
    .reset_index()  # Reset index for easier manipulation
)

# Filter to keep only question families with multiple questions
# NOTE(review): value_counts() counts rows of final_df, i.e. one per
# (encounter, qid) pair, so a family with a single question slot still exceeds 1
# as soon as it appears in more than one encounter. The filter therefore also
# keeps single-slot families. Counting distinct qids per family
# (final_df.groupby('base_qid')['qid'].nunique()) would match the stated intent;
# left unchanged here because it would change which rows are kept.
question_family_counts = final_df['base_qid'].value_counts()
multi_question_families = question_family_counts[question_family_counts > 1].index.tolist()

# Filter our grouped data to keep only these families. The explicit .copy()
# makes the result an independent frame, so adding columns to it does not
# raise pandas' SettingWithCopyWarning.
multi_question_data = grouped_by_family[grouped_by_family['base_qid'].isin(multi_question_families)].copy()

# Modified function to extract all valid answers (treating "Not mentioned" appropriately)
def get_valid_answers(row):
    """
    Collapse the per-slot answers of one question family into its valid answers.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'answer_text' (list of answer strings, one per slot) and
            'answer_index' (list of the matching option indices).

    Returns:
        Tuple (valid_answers, valid_indices) of two parallel lists:
        - slots whose answer is missing (None/NaN) are ignored;
        - if no answered slot remains, both lists are empty;
        - if every answered slot is "Not mentioned", the result is
          ["Not mentioned"] with the index of the first such slot;
        - otherwise the distinct non-"Not mentioned" answers are returned in
          first-seen order, each with the index of its first occurrence.
    """
    def _is_missing(value):
        # NaN is the only float that is not equal to itself
        return value is None or (isinstance(value, float) and value != value)

    answered = [
        (ans, idx)
        for ans, idx in zip(row['answer_text'], row['answer_index'])
        if not _is_missing(ans)
    ]
    if not answered:
        # Nothing usable (empty family or only missing answers)
        return [], []

    informative = [(ans, idx) for ans, idx in answered if ans != "Not mentioned"]
    if not informative:
        # Every answered slot says "Not mentioned": keep it as the single label
        return ["Not mentioned"], [answered[0][1]]

    valid_answers = []
    valid_indices = []
    for ans, idx in informative:
        # De-duplicate while preserving slot order
        if ans not in valid_answers:
            valid_answers.append(ans)
            valid_indices.append(idx)

    return valid_answers, valid_indices

# Compute the (valid_answers, valid_indices) pair of every question-family row,
# then store the two halves as separate list-valued columns
answer_pairs = [get_valid_answers(row) for _, row in multi_question_data.iterrows()]
multi_question_data['valid_answers'] = [pair[0] for pair in answer_pairs]
multi_question_data['valid_indices'] = [pair[1] for pair in answer_pairs]

In [6]:
# Preview the per-(encounter, question family) rows, including the derived
# valid_answers / valid_indices columns.
multi_question_data.head()

Unnamed: 0,encounter_id,base_qid,qid,question_en,answer_text,answer_index,image_ids,options_en,question_type_en,question_category_en,query_title_en,query_content_en,author_id,valid_answers,valid_indices
0,ENC00001,CQID010,[CQID010-001],[How much of the body is affected?],[limited area],[1],"[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]","[single spot, limited area, widespread, Not me...",Site,General,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,[limited area],[1]
1,ENC00001,CQID011,"[CQID011-001, CQID011-002, CQID011-003, CQID01...","[1 Where is the affected area?, 2 Where is the...","[back, Not mentioned, Not mentioned, Not menti...","[5, 7, 7, 7, 7, 7]","[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]","[head, neck, upper extremities, lower extremit...",Site Location,General,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,[back],[5]
2,ENC00001,CQID012,"[CQID012-001, CQID012-002, CQID012-003, CQID01...",[1 How large are the affected areas? Please sp...,"[size of palm, Not mentioned, Not mentioned, N...","[1, 3, 3, 3, 3, 3]","[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]","[size of thumb nail, size of palm, larger area...",Size,General,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,[size of palm],[1]
3,ENC00001,CQID015,[CQID015-001],[When did the patient first notice the issue?],[Not mentioned],[6],"[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]","[within hours, within days, within weeks, with...",Onset,General,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,[Not mentioned],[6]
4,ENC00001,CQID020,"[CQID020-001, CQID020-002, CQID020-003, CQID02...",[1 What label best describes the affected area...,"[thick or raised, Not mentioned, Not mentioned...","[3, 9, 9, 9, 9, 9, 9, 9, 9]","[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]","[raised or bumpy, flat, skin loss or sunken, t...",Skin Description,Skin Specific,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,[thick or raised],[3]


In [7]:
# Number of valid answers held by each (encounter, question family) row
valid_answer_counts = multi_question_data['valid_answers'].map(len)
multi_valid_answer_counts = (valid_answer_counts > 1).sum()
single_valid_answer_counts = (valid_answer_counts == 1).sum()

print(f"Number of question families with more than one valid answer: {multi_valid_answer_counts}")
print(f"Number of question families with one valid answer: {single_valid_answer_counts}")
print(f"Total number of multi-question families: {len(multi_question_data)}")
print(f"Percentage with multiple valid answers: {multi_valid_answer_counts / len(multi_question_data) * 100:.2f}%")

# Print the first few rows that carry several valid answers
examples_multi_valid = multi_question_data.loc[valid_answer_counts > 1].head(5)
for example in examples_multi_valid.itertuples(index=False):
    print(f"Encounter ID: {example.encounter_id}, Base QID: {example.base_qid}")
    print(f"Questions: {example.qid}")
    print(f"All answers: {example.answer_text}")
    print(f"Valid answers: {example.valid_answers}")
    print()

Number of question families with more than one valid answer: 177
Number of question families with one valid answer: 2523
Total number of multi-question families: 2700
Percentage with multiple valid answers: 6.56%
Encounter ID: ENC00003, Base QID: CQID011
Questions: ['CQID011-001', 'CQID011-002', 'CQID011-003', 'CQID011-004', 'CQID011-005', 'CQID011-006']
All answers: ['back', 'chest/abdomen', 'Not mentioned', 'Not mentioned', 'Not mentioned', 'Not mentioned']
Valid answers: ['back', 'chest/abdomen']

Encounter ID: ENC00003, Base QID: CQID020
Questions: ['CQID020-001', 'CQID020-002', 'CQID020-003', 'CQID020-004', 'CQID020-005', 'CQID020-006', 'CQID020-007', 'CQID020-008', 'CQID020-009']
All answers: ['raised or bumpy', 'scab', 'Not mentioned', 'Not mentioned', 'Not mentioned', 'Not mentioned', 'Not mentioned', 'Not mentioned', 'Not mentioned']
Valid answers: ['raised or bumpy', 'scab']

Encounter ID: ENC00004, Base QID: CQID011
Questions: ['CQID011-001', 'CQID011-002', 'CQID011-003', 'C

In [8]:
# Get instances with only one valid answer
single_valid_answer_data = multi_question_data[multi_question_data['valid_answers'].apply(len) == 1]

# Each image listed on a row contributes one (encounter_id, base_qid, image_id)
# combination, so the total is the sum of the per-row image list lengths.
# Nothing is de-duplicated here: an image id repeated within a row would be
# counted once per occurrence.
unique_combinations = sum(len(image_ids) for image_ids in single_valid_answer_data['image_ids'])

print(f"Number of unique encounter_id-base_qid-image_id combinations with one valid answer: {unique_combinations}")

Number of unique encounter_id-base_qid-image_id combinations with one valid answer: 7280


In [9]:
# Get instances with more than one valid answer
multi_valid_answer_data = multi_question_data[multi_question_data['valid_answers'].apply(len) > 1]

# Each image listed on a row contributes one (encounter_id, base_qid, image_id)
# combination, so the total is the sum of the per-row image list lengths.
# Nothing is de-duplicated here: an image id repeated within a row would be
# counted once per occurrence.
unique_combinations = sum(len(image_ids) for image_ids in multi_valid_answer_data['image_ids'])

print(f"Number of unique encounter_id-base_qid-image_id combinations with multiple valid answers: {unique_combinations}")

Number of unique encounter_id-base_qid-image_id combinations with multiple valid answers: 613


In [10]:
# Distinct encounters present in the merged answer table
num_encounters = final_df['encounter_id'].nunique()
print(f"Total number of unique encounters: {num_encounters}")

# Families that passed the multi-question filter
print(f"Number of question families with multiple slots: {len(multi_question_families)}")
print(f"These families are: {multi_question_families}")

# Size of the encounter x question-family table
print(f"Total rows in multi_question_data: {len(multi_question_data)}")

# Rows per question family
family_counts = multi_question_data['base_qid'].value_counts()
print("\nCount of encounters per question family:")
for family in family_counts.index:
    print(f"{family}: {family_counts.loc[family]} encounters")

# Rows per question family that carry more than one valid answer
print("\nCount of encounters with multiple valid answers per question family:")
has_multiple_answers = multi_question_data['valid_answers'].map(len) > 1
for family in multi_question_families:
    in_family = multi_question_data['base_qid'] == family
    count = (in_family & has_multiple_answers).sum()
    print(f"{family}: {count} encounters with multiple valid answers out of {in_family.sum()} total")

Total number of unique encounters: 300
Number of question families with multiple slots: 9
These families are: ['CQID020', 'CQID012', 'CQID011', 'CQID010', 'CQID015', 'CQID025', 'CQID034', 'CQID035', 'CQID036']
Total rows in multi_question_data: 2700

Count of encounters per question family:
CQID010: 300 encounters
CQID011: 300 encounters
CQID012: 300 encounters
CQID015: 300 encounters
CQID020: 300 encounters
CQID025: 300 encounters
CQID034: 300 encounters
CQID035: 300 encounters
CQID036: 300 encounters

Count of encounters with multiple valid answers per question family:
CQID020: 75 encounters with multiple valid answers out of 300 total
CQID012: 9 encounters with multiple valid answers out of 300 total
CQID011: 93 encounters with multiple valid answers out of 300 total
CQID010: 0 encounters with multiple valid answers out of 300 total
CQID015: 0 encounters with multiple valid answers out of 300 total
CQID025: 0 encounters with multiple valid answers out of 300 total
CQID034: 0 encount

In [14]:
# Create a simplified dataset treating everything as multi-label
def create_multi_label_dataset(source_df=None, images_dir=None):
    """
    Expand each (encounter, question family) row into one record per image.

    Args:
        source_df: DataFrame with one row per encounter / question family,
            providing the columns read below ('encounter_id', 'base_qid',
            'valid_answers', 'valid_indices', 'image_ids', 'question_en',
            'query_title_en', 'query_content_en', 'author_id', 'options_en',
            'question_type_en', 'question_category_en').
            Defaults to the notebook-level `multi_question_data`.
        images_dir: Directory containing the image files.
            Defaults to the notebook-level `IMAGES_DIR`.

    Returns:
        pd.DataFrame with one row per (source row, image) whose image file
        exists under `images_dir`. The frame always has the columns listed in
        `columns`, even when no record was produced.
    """
    if source_df is None:
        source_df = multi_question_data
    if images_dir is None:
        images_dir = IMAGES_DIR

    columns = [
        'encounter_id', 'base_qid', 'image_id', 'image_path',
        'valid_answers', 'valid_indices', 'question_text',
        'query_title_en', 'query_content_en', 'author_id',
        'options_en', 'question_type_en', 'question_category_en',
        'is_multi_label',
    ]
    multi_label_data = []
    skipped_images = 0

    # iterrows() is a generator without a length, so the row count is passed
    # explicitly to let tqdm show real progress.
    for _, row in tqdm(source_df.iterrows(), total=len(source_df),
                       desc="Creating multi-label dataset"):
        valid_answers = row['valid_answers']

        # For each image in the encounter
        for img_id in row['image_ids']:
            img_path = os.path.join(images_dir, img_id)

            # Skip if image doesn't exist, but keep count so it can be reported
            if not os.path.exists(img_path):
                skipped_images += 1
                continue

            multi_label_data.append({
                'encounter_id': row['encounter_id'],
                'base_qid': row['base_qid'],
                'image_id': img_id,
                'image_path': img_path,
                'valid_answers': valid_answers,
                'valid_indices': row['valid_indices'],
                # Taking the first question of the family as reference
                'question_text': row['question_en'][0],
                'query_title_en': row['query_title_en'],
                'query_content_en': row['query_content_en'],
                'author_id': row['author_id'],
                'options_en': row['options_en'],
                'question_type_en': row['question_type_en'],
                'question_category_en': row['question_category_en'],
                'is_multi_label': len(valid_answers) > 1
            })

    if skipped_images:
        print(f"Skipped {skipped_images} image reference(s) with no file in {images_dir}")

    # Passing columns= fixes the schema and column order even for an empty result
    return pd.DataFrame(multi_label_data, columns=columns)

# Create the dataset (one row per encounter / question family / existing image)
multi_label_dataset = create_multi_label_dataset()

# Preview the first rows
multi_label_dataset.head()

Creating multi-label dataset: 0it [00:00, ?it/s]

Unnamed: 0,encounter_id,base_qid,image_id,image_path,valid_answers,valid_indices,question_text,query_title_en,query_content_en,author_id,options_en,question_type_en,question_category_en,is_multi_label
0,ENC00001,CQID010,IMG_ENC00001_00001.jpg,/storage/coda1/p-dsgt_clef2025/0/kthakrar3/med...,[limited area],[1],How much of the body is affected?,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,"[single spot, limited area, widespread, Not me...",Site,General,False
1,ENC00001,CQID010,IMG_ENC00001_00002.jpg,/storage/coda1/p-dsgt_clef2025/0/kthakrar3/med...,[limited area],[1],How much of the body is affected?,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,"[single spot, limited area, widespread, Not me...",Site,General,False
2,ENC00001,CQID011,IMG_ENC00001_00001.jpg,/storage/coda1/p-dsgt_clef2025/0/kthakrar3/med...,[back],[5],1 Where is the affected area?,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,"[head, neck, upper extremities, lower extremit...",Site Location,General,False
3,ENC00001,CQID011,IMG_ENC00001_00002.jpg,/storage/coda1/p-dsgt_clef2025/0/kthakrar3/med...,[back],[5],1 Where is the affected area?,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,"[head, neck, upper extremities, lower extremit...",Site Location,General,False
4,ENC00001,CQID012,IMG_ENC00001_00001.jpg,/storage/coda1/p-dsgt_clef2025/0/kthakrar3/med...,[size of palm],[1],1 How large are the affected areas? Please spe...,Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,U04473,"[size of thumb nail, size of palm, larger area...",Size,General,False


In [16]:
# Save the dataset
# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
# NOTE: list-valued columns (e.g. valid_answers, valid_indices, options_en) are
# written as their string representation and must be parsed again after
# reading the CSV back (for instance with ast.literal_eval).
output_path = "outputs/multi_label_dataset.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
multi_label_dataset.to_csv(output_path, index=False)

# Print some statistics
print(f"Multi-label dataset created with {len(multi_label_dataset)} entries")
print(f"Number of entries with multiple labels: {multi_label_dataset['is_multi_label'].sum()}")

# Show sample of the dataset
print("\nSample of multi-label dataset:")
sample_cols = ['encounter_id', 'base_qid', 'image_id', 'question_text', 'is_multi_label', 'query_title_en']
multi_label_dataset[sample_cols].head()

Multi-label dataset created with 7893 entries
Number of entries with multiple labels: 613

Sample of multi-label dataset:


Unnamed: 0,encounter_id,base_qid,image_id,question_text,is_multi_label,query_title_en
0,ENC00001,CQID010,IMG_ENC00001_00001.jpg,How much of the body is affected?,False,Pleural effusion accompanied by rash
1,ENC00001,CQID010,IMG_ENC00001_00002.jpg,How much of the body is affected?,False,Pleural effusion accompanied by rash
2,ENC00001,CQID011,IMG_ENC00001_00001.jpg,1 Where is the affected area?,False,Pleural effusion accompanied by rash
3,ENC00001,CQID011,IMG_ENC00001_00002.jpg,1 Where is the affected area?,False,Pleural effusion accompanied by rash
4,ENC00001,CQID012,IMG_ENC00001_00001.jpg,1 How large are the affected areas? Please spe...,False,Pleural effusion accompanied by rash


In [17]:
# Per question family: share of multi-label rows (as a percentage) and number
# of distinct encounters, ordered from the most to the least multi-label family
question_type_stats = (
    multi_label_dataset
    .groupby('base_qid')
    .agg(
        is_multi_label=('is_multi_label', 'mean'),
        encounter_id=('encounter_id', 'nunique'),
    )
    .reset_index()
    .assign(percent_multi_label=lambda stats: stats['is_multi_label'] * 100)
    .sort_values('percent_multi_label', ascending=False)
)

print("\nPercentage of multi-label entries by question type:")
question_type_stats[['base_qid', 'percent_multi_label', 'encounter_id']]


Percentage of multi-label entries by question type:


Unnamed: 0,base_qid,percent_multi_label,encounter_id
1,CQID011,39.90878,300
4,CQID020,26.22577,300
2,CQID012,3.762828,300
0,CQID010,0.0,300
3,CQID015,0.0,300
5,CQID025,0.0,300
6,CQID034,0.0,300
7,CQID035,0.0,300
8,CQID036,0.0,300
