In [2]:
import json
import pandas as pd
import os
from PIL import Image
from IPython.display import display, Image as IPImage
import google.generativeai as genai
from dotenv import load_dotenv

In [5]:
# Read the training-set metadata file into memory as parsed JSON
json_file = os.path.join("2024_dataset", "train_downloaded.json")
with open(json_file, mode="r", encoding="utf-8") as metadata_file:
    # json.loads on the full text is what json.load does internally
    data = json.loads(metadata_file.read())

In [6]:
# Filesystem locations, all relative to the dataset root folder
dataset_root = "2024_dataset"
images_dir = os.path.join(dataset_root, "images", "train")
script_path = os.path.join(dataset_root, "download_data.py")

In [13]:
# Build one DataFrame row per (encounter, response) pair, restricted to encounters
# whose image exists in images_dir. Each response gets its own row so the response
# fields end up in separate columns.
train_columns = [
    "encounter_id",
    "image_path",
    "query_title",
    "query_content",
    "author_id",
    "response_content",
    "completeness",
    "contains_freq_ans",
]
train_data = []

for entry in data:
    # Images are looked up as <encounter_id>.jpg inside images_dir
    image_path = os.path.join(images_dir, f"{entry['encounter_id']}.jpg")

    if os.path.exists(image_path):
        for response in entry["responses"]:
            train_data.append({
                "encounter_id": entry["encounter_id"],
                "image_path": image_path,
                "query_title": entry["query_title_en"],
                "query_content": entry["query_content_en"],
                "author_id": response["author_id"],
                "response_content": response["content_en"],
                "completeness": response["completeness"],
                "contains_freq_ans": response["contains_freq_ans"]
            })

# Convert to DataFrame. The column list is passed explicitly so the frame keeps its
# schema even when no image was found: an empty list would otherwise give a frame
# with no columns, and any later column lookup would raise KeyError.
train_df = pd.DataFrame(train_data, columns=train_columns)
display(train_df.head())

Unnamed: 0,encounter_id,image_path,query_title,query_content,author_id,response_content,completeness,contains_freq_ans
0,ih99w9,2024_dataset\images\train\ih99w9.jpg,I had this mole that appeared under my eye abo...,,annotator1,"Most probably it is a case of inflamed pimple,...",1,1
1,11n62qx,2024_dataset\images\train\11n62qx.jpg,Spot in my hairline,,annotator1,"Most probably it is a case of solar lentigo, a...",1,1
2,vk578x,2024_dataset\images\train\vk578x.jpg,Can someone help me ID this random bump? Today...,,annotator1,Most propably it is a case of a cyst in the gr...,1,1
3,123bko0,2024_dataset\images\train\123bko0.jpg,Rough bump/spot on the side of finger? What ex...,,annotator1,It is a case of common wart. This noncancerous...,1,1
4,11m0l9c,2024_dataset\images\train\11m0l9c.jpg,Puffy line on face any suggestions?,,annotator1,Most probably it is a case of deep coarse wrin...,1,1


In [14]:
# Number of rows in the frame of responses whose image was found on disk
len(train_df.index)

213

In [12]:
# Build one DataFrame row per (encounter, response) pair, restricted to encounters
# whose image is NOT present in images_dir. The image_path column holds the path
# that was checked, i.e. where the image was expected to be.
missing_columns = [
    "encounter_id",
    "image_path",
    "query_title",
    "query_content",
    "author_id",
    "response_content",
    "completeness",
    "contains_freq_ans",
]
train_data_missing = []

for entry in data:
    # Images are looked up as <encounter_id>.jpg inside images_dir
    image_path = os.path.join(images_dir, f"{entry['encounter_id']}.jpg")

    if not os.path.exists(image_path):
        for response in entry["responses"]:
            train_data_missing.append({
                "encounter_id": entry["encounter_id"],
                "image_path": image_path,
                "query_title": entry["query_title_en"],
                "query_content": entry["query_content_en"],
                "author_id": response["author_id"],
                "response_content": response["content_en"],
                "completeness": response["completeness"],
                "contains_freq_ans": response["contains_freq_ans"]
            })

# Convert to DataFrame. The column list is passed explicitly so the frame keeps its
# schema even when every image is present: an empty list would otherwise give a frame
# with no columns, and a lookup such as train_df_missing["encounter_id"] would raise
# KeyError.
train_df_missing = pd.DataFrame(train_data_missing, columns=missing_columns)
display(train_df_missing.head())

Unnamed: 0,encounter_id,image_path,query_title,query_content,author_id,response_content,completeness,contains_freq_ans
0,11mk4th,2024_dataset\images\train\11mk4th.jpg,M21 Bump? has persisted for over a year now ev...,[deleted],annotator1,Most probably it is a case of Milia. A milium ...,1,1
1,125gfgg,2024_dataset\images\train\125gfgg.jpg,Maybe a pimple under the skin,[deleted],annotator1,Most probably this is a case of blind pimple. ...,1,1
2,zmsedx,2024_dataset\images\train\zmsedx.jpg,Random ring of blister appeared overnight - no...,[deleted],annotator1,This is a case of contact eczema most probably...,1,1
3,11m079w,2024_dataset\images\train\11m079w.jpg,[deleted by user],[removed],annotator1,"Cold, dry weather, sun damage, and frequently ...",1,1
4,z8b11d,2024_dataset\images\train\z8b11d.jpg,Can anyone identify this on my husband’s face?...,[deleted],annotator1,It is a case of rosacea. Rosacea is a chronic ...,1,1


In [15]:
# Number of rows in the frame of responses whose image was not found on disk
len(train_df_missing.index)

134

In [19]:
# Distinct encounter ids among the rows whose image was not found, and how many there are
encounter_id_column = train_df_missing["encounter_id"]
missing_encounter_ids = encounter_id_column.unique()
len(missing_encounter_ids)

134

In [16]:
# Count the entries in the images directory.
# os.listdir returns every entry name, so this is not limited to .jpg files.
image_dir_entries = os.listdir(images_dir)
len(image_dir_entries)

233

In [20]:
# Count distinct encounter ids across the whole metadata file, with or without an image
distinct_ids = {record["encounter_id"] for record in data}
unique_encounter_ids = len(distinct_ids)
print(f"Number of unique encounter_ids: {unique_encounter_ids}")

Number of unique encounter_ids: 347
