In [4]:
import os
import json
import pandas as pd

In [5]:
# Define base directory relative to current notebook
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath("__file__")))
DATA_DIR = os.path.join(BASE_DIR, "2025_dataset", "train")

# Load train.json
train_json_path = os.path.join(DATA_DIR, "train.json")
train_df = pd.read_json(train_json_path)

# Keep relevant columns
train_df = train_df[[
    "encounter_id", "author_id", "image_ids", "responses", 
    "query_title_en", "query_content_en"
]]

# Convert image IDs to full image paths
def generate_image_paths(image_ids):
    return [os.path.normpath(os.path.join(DATA_DIR, "images_train", img)) for img in image_ids]

train_df["image_paths"] = train_df["image_ids"].apply(generate_image_paths)

# Flatten responses to just English content
train_df["responses_en"] = train_df["responses"].apply(
    lambda resp_list: [r["content_en"] for r in resp_list]
)

In [6]:
train_df.head()

Unnamed: 0,encounter_id,author_id,image_ids,responses,query_title_en,query_content_en,image_paths,responses_en
0,ENC00001,U04473,"[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]","[{'author_id': 'U00217', 'content_zh': '银屑病，似与...",Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,[Psoriasis seems to have no relation to pleura...
1,ENC00002,U06063,"[IMG_ENC00002_00001.jpg, IMG_ENC00002_00002.jp...","[{'author_id': 'U11305', 'content_zh': '脚气', '...",What is on the bottom of the right foot?,"The patient is a 50-year-old male, who has bee...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,[Beriberi]
2,ENC00003,U00780,"[IMG_ENC00003_00001.jpg, IMG_ENC00003_00002.jp...","[{'author_id': 'U01131', 'content_zh': '瘙痒症，有无...",Interpreting Images - Is it magical skin?,"Male, 65 years old, skin lesions as shown in t...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,"[Pruritus, is there any other special medical ..."
3,ENC00004,U00209,"[IMG_ENC00004_00001.jpg, IMG_ENC00004_00002.jpg]","[{'author_id': 'U06715', 'content_zh': '肢端角化病？...",Skin Disease,"Male, 15 years old, keratosis on both palms, s...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,"[Acrokeratosis?, Progressive Symmetrical Eryth..."
4,ENC00005,U09050,[IMG_ENC00005_00001.jpg],"[{'author_id': 'U09402', 'content_zh': '是否神经性皮...",Perifollicular atrophy?,"Young female, silver-gray dot-like atrophy spo...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,"[Is it neurodermatitis?, Impotence?, Lichen Sc..."


In [7]:
# Load train_cvqa.json
cvqa_path = os.path.join(DATA_DIR, "train_cvqa.json")
with open(cvqa_path, "r", encoding="utf-8") as f:
    cvqa_data = json.load(f)

cvqa_df = pd.json_normalize(cvqa_data)

# Melt to long format
cvqa_long = cvqa_df.melt(id_vars=["encounter_id"], 
                         var_name="qid", 
                         value_name="answer_index")

# Load closed questions
questions_path = os.path.join(DATA_DIR, "closedquestions_definitions_imageclef2025.json")
with open(questions_path, "r", encoding="utf-8") as f:
    questions = json.load(f)

questions_df = pd.json_normalize(questions)
questions_df = questions_df[["qid", "question_en", "options_en"]]

# Merge CVQA answers with question metadata
cvqa_merged = cvqa_long.merge(questions_df, on="qid", how="left")

cvqa_merged.head()

Unnamed: 0,encounter_id,qid,answer_index,question_en,options_en
0,ENC00001,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me..."
1,ENC00002,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me..."
2,ENC00003,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me..."
3,ENC00004,CQID010-001,2,How much of the body is affected?,"[single spot, limited area, widespread, Not me..."
4,ENC00005,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me..."


In [8]:
# Map answer index to text
def get_answer_text(row):
    try:
        return row["options_en"][row["answer_index"]]
    except (IndexError, TypeError):
        return None

cvqa_merged["answer_text"] = cvqa_merged.apply(get_answer_text, axis=1)

# Final merge to attach query + image context
final_df = cvqa_merged.merge(train_df, on="encounter_id", how="left")

In [9]:
final_df.head()

Unnamed: 0,encounter_id,qid,answer_index,question_en,options_en,answer_text,author_id,image_ids,responses,query_title_en,query_content_en,image_paths,responses_en
0,ENC00001,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",limited area,U04473,"[IMG_ENC00001_00001.jpg, IMG_ENC00001_00002.jpg]","[{'author_id': 'U00217', 'content_zh': '银屑病，似与...",Pleural effusion accompanied by rash,A patient with pleural effusion is accompanied...,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,[Psoriasis seems to have no relation to pleura...
1,ENC00002,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",limited area,U06063,"[IMG_ENC00002_00001.jpg, IMG_ENC00002_00002.jp...","[{'author_id': 'U11305', 'content_zh': '脚气', '...",What is on the bottom of the right foot?,"The patient is a 50-year-old male, who has bee...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,[Beriberi]
2,ENC00003,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",limited area,U00780,"[IMG_ENC00003_00001.jpg, IMG_ENC00003_00002.jp...","[{'author_id': 'U01131', 'content_zh': '瘙痒症，有无...",Interpreting Images - Is it magical skin?,"Male, 65 years old, skin lesions as shown in t...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,"[Pruritus, is there any other special medical ..."
3,ENC00004,CQID010-001,2,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",widespread,U00209,"[IMG_ENC00004_00001.jpg, IMG_ENC00004_00002.jpg]","[{'author_id': 'U06715', 'content_zh': '肢端角化病？...",Skin Disease,"Male, 15 years old, keratosis on both palms, s...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,"[Acrokeratosis?, Progressive Symmetrical Eryth..."
4,ENC00005,CQID010-001,1,How much of the body is affected?,"[single spot, limited area, widespread, Not me...",limited area,U09050,[IMG_ENC00005_00001.jpg],"[{'author_id': 'U09402', 'content_zh': '是否神经性皮...",Perifollicular atrophy?,"Young female, silver-gray dot-like atrophy spo...",[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,"[Is it neurodermatitis?, Impotence?, Lichen Sc..."


In [10]:
len(final_df)

8100

In [11]:
# Define the path
csv_path = os.path.normpath(os.path.join("..", "2025_dataset", "train", "final_df.csv"))

# Read the file
final_df_2 = pd.read_csv(csv_path)

# Get the length
len(final_df_2)

8100

In [18]:
diff = final_df.compare(final_df_2, keep_shape=True, keep_equal=False)
diff

Unnamed: 0_level_0,encounter_id,encounter_id,qid,qid,answer_index,answer_index,question_en,question_en,options_en,options_en,...,responses,responses,query_title_en,query_title_en,query_content_en,query_content_en,image_paths,image_paths,responses_en,responses_en
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
0,,,,,,,,,"[single spot, limited area, widespread, Not me...","['single spot', 'limited area', 'widespread', ...",...,"[{'author_id': 'U00217', 'content_zh': '银屑病，似与...","[{'author_id': 'U00217', 'content_zh': '银屑病，似与...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,[Psoriasis seems to have no relation to pleura...,['Psoriasis seems to have no relation to pleur...
1,,,,,,,,,"[single spot, limited area, widespread, Not me...","['single spot', 'limited area', 'widespread', ...",...,"[{'author_id': 'U11305', 'content_zh': '脚气', '...","[{'author_id': 'U11305', 'content_zh': '脚气', '...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,[Beriberi],['Beriberi']
2,,,,,,,,,"[single spot, limited area, widespread, Not me...","['single spot', 'limited area', 'widespread', ...",...,"[{'author_id': 'U01131', 'content_zh': '瘙痒症，有无...","[{'author_id': 'U01131', 'content_zh': '瘙痒症，有无...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,"[Pruritus, is there any other special medical ...","['Pruritus, is there any other special medical..."
3,,,,,,,,,"[single spot, limited area, widespread, Not me...","['single spot', 'limited area', 'widespread', ...",...,"[{'author_id': 'U06715', 'content_zh': '肢端角化病？...","[{'author_id': 'U06715', 'content_zh': '肢端角化病？...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,"[Acrokeratosis?, Progressive Symmetrical Eryth...","['Acrokeratosis?', 'Progressive Symmetrical Er..."
4,,,,,,,,,"[single spot, limited area, widespread, Not me...","['single spot', 'limited area', 'widespread', ...",...,"[{'author_id': 'U09402', 'content_zh': '是否神经性皮...","[{'author_id': 'U09402', 'content_zh': '是否神经性皮...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,"[Is it neurodermatitis?, Impotence?, Lichen Sc...","['Is it neurodermatitis?', 'Impotence?', 'Lich..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8095,,,,,,,,,"[smooth, rough, Not mentioned]","['smooth', 'rough', 'Not mentioned']",...,"[{'author_id': 'U09522', 'content_zh': '多形红斑',...","[{'author_id': 'U09522', 'content_zh': '多形红斑',...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,"[Erythema Multiforme, Erythema Multiforme, Pap...","['Erythema Multiforme', 'Erythema Multiforme',..."
8096,,,,,,,,,"[smooth, rough, Not mentioned]","['smooth', 'rough', 'Not mentioned']",...,"[{'author_id': 'U00590', 'content_zh': '根据图片是血...","[{'author_id': 'U00590', 'content_zh': '根据图片是血...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,"[According to the picture, it is a vascular ne...","['According to the picture, it is a vascular n..."
8097,,,,,,,,,"[smooth, rough, Not mentioned]","['smooth', 'rough', 'Not mentioned']",...,"[{'author_id': 'U01379', 'content_zh': '扁平苔藓？'...","[{'author_id': 'U01379', 'content_zh': '扁平苔藓？'...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,"[Lichen Planus?, Bullous LP, It might be liche...","['Lichen Planus?', 'Bullous LP', 'It might be ..."
8098,,,,,,,,,"[smooth, rough, Not mentioned]","['smooth', 'rough', 'Not mentioned']",...,"[{'author_id': 'U16886', 'content_zh': '疼么？图上看...","[{'author_id': 'U16886', 'content_zh': '疼么？图上看...",,,,,[c:\Users\karishma\OneDrive\Projects\mediqa-ma...,['C:\\Users\\karishma\\OneDrive\\Projects\\med...,[Does it hurt? It looks a bit red and swollen ...,['Does it hurt? It looks a bit red and swollen...
