In [None]:
import json
import pandas as pd
from tqdm import tqdm
import os
from glob import glob

In [None]:
# Read the CXR-Reason image list: one path per line, with surrounding
# whitespace (including the trailing newline) removed.
with open('CXR-Reason_image_filenames') as filename_file:
    image_filenames=[line.strip() for line in filename_file]

# Golden Dataset

In [None]:
# Load the tab-separated gold annotation table and keep only the rows whose
# categoryID is an anatomical finding or a disease.
df=pd.read_csv('./gold_dataset/gold_attributes_relations_500pts_500studies1st.txt', sep='\t')
# `isin` is vectorised and always yields a boolean mask, unlike a
# per-element `apply` with a lambda.
attributes_gold_df=df[df.categoryID.isin(['anatomicalfinding','disease'])]

In [None]:
# Keep only the gold rows whose image appears in the CXR-Reason image list.
# Membership is tested against a set built once, so each lookup is
# constant-time instead of a scan over the whole filename list.
image_filename_set=set(image_filenames)
filtered_data=[]
for idx, data in tqdm(attributes_gold_df.iterrows(), total=len(attributes_gold_df)):
    patient='p'+str(data['patient_id'])
    study='s'+str(data['study_id'])
    image=data['image_id'].replace('.dcm','.jpg')
    # Path layout: <first 3 chars of patient dir>/<patient dir>/<study dir>/<image>.jpg
    image_path=f'{patient[:3]}/{patient}/{study}/{image}'
    # The filename list stores paths with a leading 'files/' component.
    if 'files/'+image_path in image_filename_set:
        filtered_data.append(data)

In [None]:
# Rebuild the gold DataFrame from the rows collected in `filtered_data`,
# replacing the unfiltered frame of the same name.
attributes_gold_df=pd.DataFrame(filtered_data)

In [None]:
# Collect, per patient, the disease labels (with their yes/no context) and
# the anatomical findings whose context is 'yes'.
patient_level_findings={}
patient_level_disease={}
for idx, row in attributes_gold_df.iterrows():
    patient_id=row['patient_id']
    # Both dicts get an entry for every patient seen, even if it stays empty;
    # empty entries are filtered out after the loop.
    findings_for_patient=patient_level_findings.setdefault(patient_id,[])
    diseases_for_patient=patient_level_disease.setdefault(patient_id,[])

    if row['categoryID']=='disease':
        # Stored as a (label, context) tuple rather than a "+"-joined string,
        # so a label that itself contains "+" cannot be split incorrectly.
        diseases_for_patient.append((row['label_name'],row['context']))
    elif row['context']=='yes':
        findings_for_patient.append([row['bbox'],row['label_name']])

# Deduplicate the (label, context) pairs; sorting makes the order identical
# across runs (plain set iteration order depends on string hash randomisation).
# Each pair is emitted as a [label, context] list.
patient_level_disease={k:[list(pair) for pair in sorted(set(v))] for k,v in patient_level_disease.items() if len(v)>0}
# Keep only patients that have at least one positive finding AND at least one disease label.
patient_level_findings={k:v for k,v in patient_level_findings.items() if len(v)>0 and k in patient_level_disease}

# Build the gold questions. For every (patient, disease) pair four kinds of
# prompt are produced:
#   no_findings     - the bare disease question
#   finding         - one question per distinct finding, prefixed with it
#   all_findings    - one question prefixed with every distinct finding
#   finding+anatomy - one question per (anatomy, finding) annotation
questions=[]
for k, v in patient_level_disease.items():
    if k not in patient_level_findings:
        continue

    # Distinct finding names, sorted so the concatenated "all findings"
    # prompt is the same text on every run (set iteration order is not stable
    # across interpreter runs). Computed once per patient, not once per disease.
    unique_findings=sorted(set(i[1] for i in patient_level_findings[k]))
    all_findings_prefix=''.join(f'The patient has {finding}. ' for finding in unique_findings)

    for vi in v:
        disease=vi[0]
        yesorno=vi[1]
        question=f'Based on the given chest X-ray, does this patient have {disease}?'
        questions.append({'patient_id':k,'question':question, 'type':'no_findings', 'type_specific':'no_findings', 'answer': yesorno})

        for finding in unique_findings:
            questions.append({'patient_id':k,'question':f'The patient has {finding}. '+question, 'type':'finding', 'type_specific':finding, 'answer': yesorno})
        questions.append({'patient_id':k,'question':all_findings_prefix+question, 'type':'all_findings', 'type_specific':'all_findings', 'answer': yesorno})

        for findings in patient_level_findings[k]:
            anatomy=findings[0]
            finding=findings[1]
            questions.append({'patient_id':k,'question':f'The patient has {finding} at {anatomy}. '+question, 'type':'finding+anatomy', 'type_specific':finding+"+"+anatomy, 'answer': yesorno})

In [None]:
def process_dict(di, image_path, idx):
    """Turn one question spec into a dataset record.

    Args:
        di: Mapping with the keys 'type', 'type_specific', 'question' and
            'answer' (the latter two must be strings).
        image_path: Value stored under 'image'.
        idx: Value stored under 'question_id'.

    Returns:
        A dict with the image, question metadata, the stripped question and
        answer, an empty 'sys' field, and a two-turn 'conversations' list
        (human prompt, then gpt answer).
    """
    stripped_answer = di['answer'].strip()
    # The human turn carries the question exactly as given (not stripped),
    # prefixed with the '<image>' placeholder on its own line.
    human_turn = {'from': 'human', 'value': '<image>\n' + di['question']}
    gpt_turn = {'from': 'gpt', 'value': stripped_answer}
    return {
        'image': image_path,
        'question_type': di['type'],
        'question_type_specific': di['type_specific'],
        'sys': "",
        'question_id': idx,
        'question': di['question'].strip(),
        'answer': stripped_answer,
        'conversations': [human_turn, gpt_turn],
    }

In [None]:
# Lookup table from patient_id to that patient's study_id / image_id.
# NOTE(review): the row lookup below yields a single row only if each
# patient_id occurs once in this table - confirm for the gold data.
patient_ids=attributes_gold_df[['patient_id','study_id','image_id']].drop_duplicates().set_index('patient_id')

# Attach an image path to every question and convert it to the record layout.
final_questions=[]
for idx, q in enumerate(questions):
    patient_row=patient_ids.loc[q['patient_id']]
    patient='p'+str(q['patient_id'])
    study='s'+str(patient_row['study_id'])
    image=patient_row['image_id'].replace('.dcm','.jpg')
    image_path=f'{patient[:3]}/{patient}/{study}/{image}'

    final_questions.append(process_dict(q, image_path, idx))

# Collapse records that share image, question text and question type,
# keeping the first value of every other column.
final_questions=pd.DataFrame(final_questions)
final_questions=final_questions.groupby(['image','question','question_type']).first().reset_index()
# Replace the old question_id with a fresh 0..n-1 numbering (as the first
# column) and convert to a list of dicts.
final_questions=(
    final_questions
    .drop(columns='question_id')
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index':'question_id'})
    .to_dict(orient='records')
)

In [None]:
# Serialise all golden records as a single JSON array.
# NOTE(review): despite the .jsonl extension, the file holds one JSON
# document, not one record per line - confirm downstream readers expect this.
with open('CXR-Reason-Golden.jsonl', 'w') as f:
    json.dump(final_questions, f)

# Silver Dataset

In [None]:
# Select the silver scene-graph files that mention a disease attribute and
# whose image appears in the CXR-Reason image list.
# A set gives constant-time membership tests instead of scanning the list
# once per file.
image_filename_set=set(image_filenames)
candidates=[]
# glob returns paths in arbitrary order; sorting makes the candidate order
# (and the question ids derived from it later) reproducible.
for fp in tqdm(sorted(glob('./silver_dataset/scene_graph/*.json'))):
    with open(fp) as f:
        text=f.read()
    # Cheap substring pre-check so files without any 'disease|' attribute
    # are never JSON-parsed.
    if 'disease|' in text:
        data=json.loads(text)
        patient='p'+str(data['patient_id'])
        study='s'+str(data['study_id'])
        image=data['image_id']
        image_path=f'{patient[:3]}/{patient}/{study}/{image}.jpg'
        if 'files/'+image_path in image_filename_set:
            candidates.append(fp)

In [None]:
# Display how many scene-graph files were selected as candidates.
len(candidates)

In [None]:
def process_dict(di, image_path, idx):
    """Build the output record for a single question.

    This rebinds `process_dict`; an identical definition appears earlier in
    this notebook.

    Args:
        di: Mapping providing 'type', 'type_specific', 'question' and
            'answer' (question and answer are strings).
        image_path: Stored unchanged under 'image'.
        idx: Stored unchanged under 'question_id'.

    Returns:
        dict: the record, including a 'conversations' list with a human turn
        ('<image>' placeholder, newline, then the question as given) and a
        gpt turn (the stripped answer).
    """
    question_text = di['question']
    answer_text = di['answer'].strip()

    record = {
        'image': image_path,
        'question_type': di['type'],
        'question_type_specific': di['type_specific'],
        'sys': "",
    }
    record['question_id'] = idx
    record['question'] = question_text.strip()
    record['answer'] = answer_text
    record['conversations'] = [
        {'from': 'human', 'value': '<image>\n' + question_text},
        {'from': 'gpt', 'value': answer_text},
    ]
    return record

In [None]:
# Build the silver questions from every candidate scene graph. For each
# disease attribute of an image that has at least one positive anatomical
# finding, four kinds of prompt are produced (no_findings, finding,
# all_findings, finding+anatomy). `idx` is a running id over all records.
questions=[]
idx=0
for fp in tqdm(candidates):
    with open(fp) as f:
        data=json.load(f)
    patient='p'+str(data['patient_id'])
    study='s'+str(data['study_id'])
    image=data['image_id']
    image_path=f'{patient[:3]}/{patient}/{study}/{image}.jpg'

    # Collected as tuples in sets: duplicates disappear, and names containing
    # "+" cannot be mis-split the way a "+"-joined string could be.
    diseases=set()
    findings=set()
    for a in data['attributes']:
        anatomy=a['bbox_name']
        for i in a['attributes']:
            for j in i:
                if j.startswith('disease'):
                    # Second '|'-separated field is the yes/no flag, last field is the name.
                    parts=j.split('|')
                    diseases.add((parts[-1].strip(),parts[1].strip()))
                if j.startswith('anatomicalfinding') and '|yes|' in j:
                    # Only findings marked 'yes' are kept.
                    findings.add((anatomy,j.split('|yes|')[-1].strip()))

    # Sorted so question order and the concatenated "all findings" prompt are
    # the same on every run (set iteration order is not stable across runs).
    diseases=sorted(diseases)
    findings=sorted(findings)

    if len(findings)>0:
        unique_findings=sorted(set(i[1] for i in findings))
        all_findings_prefix=''.join(f'The patient has {finding}. ' for finding in unique_findings)

        for vi in diseases:
            disease=vi[0]
            yesorno=vi[1]
            question=f'Based on the given chest X-ray, does this patient have {disease}?'

            # Same emission order as before: bare question, one per finding,
            # all findings together, then one per (anatomy, finding) pair.
            specs=[{'question':question, 'type':'no_findings', 'type_specific':'no_findings', 'answer': yesorno}]
            for finding in unique_findings:
                specs.append({'question':f'The patient has {finding}. '+question, 'type':'finding', 'type_specific':finding, 'answer': yesorno})
            specs.append({'question':all_findings_prefix+question, 'type':'all_findings', 'type_specific':'all_findings', 'answer': yesorno})
            for _finding in findings:
                anatomy=_finding[0]
                finding=_finding[1]
                specs.append({'question':f'The patient has {finding} at {anatomy}. '+question, 'type':'finding+anatomy', 'type_specific':finding+"+"+anatomy, 'answer': yesorno})

            for spec in specs:
                questions.append(process_dict(spec,image_path,idx))
                idx+=1

In [None]:
# Seed passed to every groupby(...).sample(...) call in the sampling cell below.
random_state=989

In [None]:
# Deduplicate the silver records on (image, question, question_type),
# keeping the first value of every other column.
questions=pd.DataFrame(questions)
questions=questions.groupby(['image','question','question_type']).first().reset_index()
# Recover the disease name from the human prompt: the text after
# 'this patient have', without the trailing '?'.
questions['disease']=questions.conversations.apply(lambda x: x[0]['value'].split('this patient have')[-1].rstrip('?').strip())

# One subset per question type. All four use a boolean mask; rebuilding a
# frame row by row via iterrows is slow and yields a column-less frame
# (breaking the groupby below) when no row matches.
no_findings=questions[questions.question_type=='no_findings']
all_findings=questions[questions.question_type=='all_findings']
findings=questions[questions.question_type=='finding']
anatomical_questions=questions[questions.question_type=='finding+anatomy']

# Draw one question of each type per (image, disease) pair, seeded for reproducibility.
no_findings_sampled=no_findings.groupby(['image','disease']).sample(1, random_state=random_state)
all_findings_sampled=all_findings.groupby(['image','disease']).sample(1, random_state=random_state)
findings_sampled=findings.groupby(['image','disease']).sample(1, random_state=random_state)
anatomical_questions_sampled=anatomical_questions.groupby(['image','disease']).sample(1, random_state=random_state)

In [None]:
# Stack the four per-type samples into one frame.
sampled_df=pd.concat([no_findings_sampled,all_findings_sampled,findings_sampled,anatomical_questions_sampled])

# Each sample contributes one row per (image, disease) pair, so a pair with
# fewer than 4 rows is missing at least one question type; collect the
# question ids of those pairs for removal.
by_image_disease=sampled_df.groupby(['image','disease'])
question_ids_per_group=by_image_disease['question_id'].agg(list)
incomplete_groups=by_image_disease.count()['question']<4
remove_index_list=[question_id for group_ids in question_ids_per_group[incomplete_groups].values for question_id in group_ids]

# Index by question_id and drop the rows of the incomplete pairs.
sampled_df=sampled_df.set_index('question_id').drop(index=remove_index_list)

In [None]:
# Read the image exception list: one entry per line, surrounding whitespace removed.
with open('IMAGE_EXCEPTION.txt') as exception_file:
    image_exception=[line.strip() for line in exception_file]

In [None]:
# Drop every sampled question whose image is listed in the exception file.
# `isin` against a set replaces a Python-level loop that scanned the
# exception list once per row.
excluded_images=set(image_exception)
remove_index_list=sampled_df.index[sampled_df['image'].isin(excluded_images)].tolist()
sampled_df=sampled_df.drop(index=remove_index_list)

In [None]:
# Discard the current index, number the remaining rows 0..n-1 in a new
# leading 'question_id' column, and convert to a list of dicts.
renumbered=sampled_df.reset_index(drop=True).reset_index()
renumbered=renumbered.rename(columns={'index':'question_id'})
final_questions=renumbered.to_dict(orient='records')

In [None]:
# Serialise all silver records as a single JSON array.
# NOTE(review): despite the .jsonl extension, the file holds one JSON
# document, not one record per line - confirm downstream readers expect this.
with open('CXR-Reason-Silver.jsonl', 'w') as f:
    json.dump(final_questions, f)