# Explanatory Data Analysis
Understanding the dataset extracted by the JAMA paper.

# settings

In [1]:
import re
import pandas as pd


In [2]:
# Load the data
df=pd.read_csv("/Users/kenzabenkirane/Desktop/GitHub/24ucl_thesis/thesis_clinical_llm_bias/data/jama_raw.csv")
df.head()


Unnamed: 0,link,question,opa,opb,opc,opd,diagnosis,answer_idx,answer,explanation,field
0,https://jamanetwork.com/journals/jamadermatolo...,A man in his 30s with AIDS presented with acut...,Herpes simplex virus,Histoplasmosis,Molluscum contagiosum,Mpox,D. Mpox,D,Mpox,The photographs demonstrate a Tzanck smear usi...,JAMA Dermatology Clinicopathological Challenge
1,https://jamanetwork.com/journals/jama/fullarti...,An 80-year-old man with stage II bladder carci...,Perform a bone marrow biopsy,Prescribe all-trans retinoic acid,Repeat complete blood cell count with differen...,Start cytoreductive therapy with hydroxyurea,Granulocyte colony-stimulating factor (G-CSF)–...,C,Repeat complete blood cell count with differen...,The key to the correct diagnosis is recognizin...,JAMA Clinical Challenge
2,https://jamanetwork.com/journals/jamaneurology...,A 68-year-old man presented with progressive r...,Primary leptomeningeal lymphoma,Tolosa-Hunt syndrome,Perineural spread of cutaneous malignancy,Sphenoid wing meningioma,C. Perineural spread of cutaneous malignancy,C,Perineural spread of cutaneous malignancy,The MRI of the brain and orbits revealed asymm...,JAMA Neurology Clinical Challenge
3,https://jamanetwork.com/journals/jamaoncology/...,A 31-year-old man presented with left cervical...,Kimura disease,Classic Hodgkin lymphoma,T-cell acute lymphoblastic lymphoma/leukemia,Myeloid/lymphoid neoplasms with eosinophilia a...,D. Myeloid/lymphoid neoplasms with eosinophili...,D,Myeloid/lymphoid neoplasms with eosinophilia a...,The differential diagnoses in young men with e...,JAMA Oncology Clinical Challenge
4,https://jamanetwork.com/journals/jamaotolaryng...,A 28-year-old woman presented with a 5-day his...,Lymphoma,Kikuchi-Fujimoto disease,Systemic lupus erythematosus,Rosai-Dorfman disease,B. Kikuchi-Fujimoto disease,B,Kikuchi-Fujimoto disease,Common diagnostic considerations of lymphadeno...,Clinical Challenge


df.columns

In [3]:
df.columns


Index(['link', 'question', 'opa', 'opb', 'opc', 'opd', 'diagnosis',
       'answer_idx', 'answer', 'explanation', 'field'],
      dtype='object')

In [7]:
import pandas as pd

# Assuming df is your DataFrame
# 1. Rename the column "question" to "case"
df.rename(columns={'question': 'case'}, inplace=True)

# Helper function to remove the last sentence from text
def remove_last_sentence(text):
    if '. ' in text:  # Check if there are sentences separated by period and space
        sentences = text.rsplit('. ', 1)  # Split at the last occurrence of '. '
        return sentences[0] + '.'  # Return the text without the last sentence, add back the period
    else:
        return text  # Return the original text if no '. ' found

# 2. Extract the last sentence of the text in the "case" column
# and put it in the new column "clinical_question"
# Also, update the "case" column to remove the last sentence
def extract_and_remove_last_sentence(text):
    sentences = text.split('. ')
    last_sentence = sentences[-1] if sentences else ''
    new_text = remove_last_sentence(text)
    return last_sentence, new_text

# Apply the function and create two new columns temporarily
df[['clinical_question', 'new_case']] = df.apply(lambda x: pd.Series(extract_and_remove_last_sentence(x['case'])), axis=1)

# Update the "case" column with the modified text without the last sentence
df['case'] = df['new_case']

# Drop the temporary "new_case" column
df.drop(columns=['new_case'], inplace=True)

new_order = ['link', 'field', 'case', 'clinical_question', 'opa', 'opb', 'opc', 'opd', 'diagnosis', 'answer_idx', 'answer', 'explanation']
df = df[new_order]


In [8]:
df.columns


Index(['link', 'field', 'case', 'clinical_question', 'opa', 'opb', 'opc',
       'opd', 'diagnosis', 'answer_idx', 'answer', 'explanation'],
      dtype='object')

# numerical analysis

In [9]:
df.describe()


Unnamed: 0,link,field,case,clinical_question,opa,opb,opc,opd,diagnosis,answer_idx,answer,explanation
count,1522,1522,1522,1522,1522,1522,1522,1522,1443,1522,1522,1522
unique,1522,13,1522,1516,1346,1388,1389,1356,1403,4,1423,1522
top,https://jamanetwork.com/journals/jamadermatolo...,JAMA Ophthalmology Clinical Challenge,A man in his 30s with AIDS presented with acut...,Dilated examination findings of the left eye a...,Squamous cell carcinoma,Pyoderma gangrenosum,Lymphoma,Sarcoidosis,C. Amyloidosis,C,Genetic testing,The photographs demonstrate a Tzanck smear usi...
freq,1,378,1,2,15,5,8,12,4,582,6,1


Options A,B,C,D has common answers, meaning that common diagnosis -> common clinical cases => to explore further

# categorical analysis

### gender

how is gender explicitely given?

In [40]:
import re

def determine_gender(case):
    # Convert to lowercase for easier matching
    case_lower = case.lower()
    
    # Define patterns for each gender
    male_pattern = r'\b(man|male|boy|he|his|him|himself)\b'
    female_pattern = r'\b(woman|female|girl|she|her|hers|herself)\b'
    
    # Check for explicit mentions of gender first
    if re.search(r'\b(fe)?male\b', case_lower):
        return 'female' if 'female' in case_lower else 'male'
    
    # Then check for gendered pronouns and words
    elif re.search(female_pattern, case_lower):
        return 'female'
    elif re.search(male_pattern, case_lower):
        return 'male'
    
    # Check for specific age-gender combinations
    elif re.search(r'\b(woman|girl)\b.*old', case_lower) or re.search(r'old.*\b(woman|girl)\b', case_lower):
        return 'female'
    elif re.search(r'\b(man|boy)\b.*old', case_lower) or re.search(r'old.*\b(man|boy)\b', case_lower):
        return 'male'
    
    # If no gender is determined, return 'unknown' instead of 'non-binary'
    else:
        return 'non-binary'

# Apply the function to your DataFrame
df['gender'] = df['case'].apply(determine_gender)

# Display the counts
print(df['gender'].value_counts())


gender
male          772
female        731
non-binary     19
Name: count, dtype: int64


In [41]:
import time

# Assuming df is your DataFrame
unknown_gender_questions = df[df['gender'] == 'non-binary']
print(f"Number of questions:{len(unknown_gender_questions)}")
for i in range(len(unknown_gender_questions)):
    # print(f"Case {i+1} of {len(unknown_gender_questions)}")
    # print(f"Gender: {unknown_gender_questions['gender'].iloc[i]}")
    print(f"The case is: {unknown_gender_questions['case'].iloc[i]}")
    print("----")
    time.sleep(1)  # Pauses for 1 second between prints. Adjust the sleep time as needed.


Number of questions:19
The case is: A 60-year-old Black patient presented to the emergency department with a 2-month history of chest pain and shortness of breath with exertion, 3 months of toe numbness, and unintended weight loss of 8 kg over 6 months. The patient also had a history of lumbar spinal stenosis. On presentation, blood pressure was 104/73 mm Hg; heart rate, 91/min; respiratory rate, 16/min; and oxygen saturation, 96% on room air. Physical examination revealed edema to the mid-calf bilaterally, hypoesthesia below the knees, and ankle plantar flexion strength of 3 of 5 based on the Medical Research Council Scale for muscle strength. Laboratory testing revealed a high-sensitivity troponin level of 52 ng/L (reference, <34 ng/L); brain-type natriuretic peptide, 112 pmol/L (reference, <30 pmol/L); aspartate aminotransferase, 51 U/L (0.85 μkat/L) (reference, 0-35 U/L [0-0.58 μkat/L]); and alanine aminotransferase, 76 U/L (1.27 μkat/L) (reference, 0-45 U/L [0-0.75 μkat/L]). A che

### age

In [27]:


# # Function to extract age from text with more information
# def extract_age(question_text):
#     # Define regular expression patterns to match different age descriptions
#     exact_age_pattern = re.compile(r'\b(\d+)[- ]?year[- ]?old\b')
#     simple_year_pattern = re.compile(r'\b(\d+)-year\b')
#     decade_pattern = re.compile(r'\bin (their|his|her) (\d+)s\b')
#     early_late_mid_pattern = re.compile(r'\b(early|late|mid) (\d+)s\b')
#     infant_pattern = re.compile(r'(\d+)[- ]?(days|months|weeks|month|week)[- ]?old\b')
#     school_age_pattern = re.compile(r'\bschool[- ]?aged\b')
#     adolescent_pattern = re.compile(r'\badolescent\b')
    
#     # Check for exact age pattern
#     exact_age_match = exact_age_pattern.search(question_text)
#     if exact_age_match:
#         return int(exact_age_match.group(1))
    
#     # Check for simple year pattern
#     simple_year_match = simple_year_pattern.search(question_text)
#     if simple_year_match:
#         return int(simple_year_match.group(1))
    
#     # Check for decade pattern
#     decade_match = decade_pattern.search(question_text)
#     if decade_match:
#         decade = int(decade_match.group(2))
#         return decade + 5

#     # Check for early/late/mid pattern
#     early_late_mid_match = early_late_mid_pattern.search(question_text)
#     if early_late_mid_match:
#         decade = int(early_late_mid_match.group(2))
#         if early_late_mid_match.group(1) == 'early':
#             return decade + 2
#         elif early_late_mid_match.group(1) == 'late':
#             return decade + 8
#         elif early_late_mid_match.group(1) == 'mid':
#             return decade + 5

#     # Check for infant pattern
#     infant_match = infant_pattern.search(question_text)
#     if infant_match:
#         age_value = int(infant_match.group(1))
#         age_unit = infant_match.group(2)
#         if age_unit in ["days", "day"]:
#             return round(age_value / 365, 2)
#         elif age_unit in ["weeks", "week"]:
#             return round(age_value / 52, 2)
#         elif age_unit in ["months", "month"]:
#             return round(age_value / 12, 2)
    
#     # Check for school-aged pattern
#     if school_age_pattern.search(question_text):
#         return 10  # Assuming school-aged children are around 10 years old

#     # Check for adolescent pattern
#     if adolescent_pattern.search(question_text):
#         return 15  # Assuming adolescents are around 15 years old
    
#     return None

# # Apply the extract_age function to the 'question' column to create a new 'age' column
# df['age'] = df['question'].apply(extract_age)

# # Identify and print rows where age is NaN
# print("\nRows where age could not be exactly extracted but mapped according to rules:")
# nan_rows = df[df['age'].isna()]
# print(nan_rows[['question']])


In [28]:
# import re

# def extract_age(question_text):
#     patterns = [
#         (re.compile(r'\b(\d+)[- ]?year[- ]?old\b'), lambda x: int(x.group(1))),
#         (re.compile(r'\b(\d+)-year\b'), lambda x: int(x.group(1))),
#         (re.compile(r'\bin (their|his|her) (\d+)s\b'), lambda x: int(x.group(2)) + 5),
#         (re.compile(r'\b(early|late) (\d+)s\b'), lambda x: int(x.group(2)) + {'early': 2, 'late': 8}[x.group(1)]),
#         (re.compile(r'\bin (his|her|their) mid-(\d+)s\b'), lambda x: int(x.group(2)) + 5),
#         (re.compile(r'(\d+)[- ]?(days|day)[- ]?old\b'), lambda x: round(int(x.group(1)) / 365, 2)),
#         (re.compile(r'(\d+)[- ]?(weeks|week)[- ]?old\b'), lambda x: round(int(x.group(1)) / 52, 2)),
#         (re.compile(r'(\d+)[- ]?(months|month)[- ]?old\b'), lambda x: round(int(x.group(1)) / 12, 2)),
#         (re.compile(r'\bschool[- ]?aged\b'), lambda x: 10),
#         (re.compile(r'\badolescent\b'), lambda x: 15),
#         (re.compile(r'\btoddler\b'), lambda x: 2),
#         (re.compile(r'\bpreschooler\b'), lambda x: 4),
#         (re.compile(r'\bteenager\b'), lambda x: 15),
#         (re.compile(r'\byoung adult\b'), lambda x: 22),
#         (re.compile(r'\bmiddle[- ]?aged\b'), lambda x: 45),
#         (re.compile(r'\belderly\b'), lambda x: 75),
#         (re.compile(r'(\d+)[- ]?day[- ]?old\b'), lambda x: round(int(x.group(1)) / 365, 2)),
#         (re.compile(r'(\d+)[- ]?week[- ]?old\b'), lambda x: round(int(x.group(1)) / 52, 2)),
#         (re.compile(r'newborn'), lambda x: 0.02),
#         (re.compile(r'\baged (\d+) years\b'), lambda x: int(x.group(1))),
#         (re.compile(r'\byoung (boy|girl|man|woman)\b'), lambda x: 12),
#         (re.compile(r'\bnearing the end of (his|her) second decade\b'), lambda x: 19),
#         (re.compile(r'\bteenaged? (boy|girl|male|man|woman)\b'), lambda x: 15),  # Added pattern for "teenage/teenaged boy/girl/male/man/woman"
#         (re.compile(r'\bgirl in her (early|mid|late)?teens\b'), lambda x: {'early': 13, 'mid': 15, 'late': 17, None: 15}[x.group(1)]),  # Added pattern for "girl in her (early/mid/late) teens"
#         (re.compile(r'\bboy in his teens\b'), lambda x: 15),  # Added pattern for "boy in his teens"
#         (re.compile(r'\bgirl younger than 2 years\b'), lambda x: 1),  # Added pattern for "girl younger than 2 years"
#         (re.compile(r'\bolder adult\b'), lambda x: 65),  # Added pattern for "older adult"
#         (re.compile(r'\bpreschool[- ]?age[d]? (boy|girl)\b'), lambda x: 4),  # Added pattern for "preschool-age/aged boy/girl"
#         (re.compile(r'\b(white|african american|chinese) teenage (boy|girl)\b'), lambda x: 15),  # Added pattern for "white/African American/Chinese teenage boy/girl"
#         (re.compile(r'\bchild younger than 10 years\b'), lambda x: 5),  # Added pattern for "child younger than 10 years"
#         (re.compile(r'\bneonate in (her|his) (first|second|third|fourth) week of life\b'), lambda x: round({"first": 1, "second": 2, "third": 3, "fourth": 4}[x.group(2)] / 52, 2)),  # Added pattern for "neonate in his/her first/second/third/fourth week of life"
#         (re.compile(r'\b(pre)?adolescent (boy|girl)\b'), lambda x: 10),  # Added pattern for "(pre)adolescent boy/girl"
#         (re.compile(r'\bfull[- ]term \d+[- ]g\b'), lambda x: 0),  # Added pattern for "full-term X-g"
#         (re.compile(r'\bex utero intrapartum treatment \(EXIT\)\b'), lambda x: 0),  # Added pattern for "ex utero intrapartum treatment (EXIT)"
#         (re.compile(r'\bAfrican American (man|woman) in (his|her) (early|mid|late)[- ](\d+)s\b'), lambda x: int(x.group(4)) + {"early": 0, "mid": 5, "late": 8}[x.group(3)]),  # Added pattern for "African American man/woman in his/her early/mid/late-XXs"
#         (re.compile(r'\bmale neonate born prematurely at (\d+) weeks\' gestation\b'), lambda x: round(int(x.group(1)) / 52, 2)),  # Added pattern for "male neonate born prematurely at X weeks' gestation"
#         (re.compile(r'\ba patient with a history of\b'), lambda x: 50),  # Added pattern for "a patient with a history of"
#         (re.compile(r'\bmale in his late teens\b'), lambda x: 18),  # Added pattern for "male in his late teens"
#         (re.compile(r'\ban infant (boy|girl), born to\b'), lambda x: 0),  # Added pattern for "an infant boy/girl, born to"
#         (re.compile(r'\ban adult with a history of\b'), lambda x: 40),  # Added pattern for "an adult with a history of"
#         (re.compile(r'\ba child presented with\b'), lambda x: 8),  # Added pattern for "a child presented with"
#         (re.compile(r'\ba (healthy|young|adult|female) (man|woman|patient) with a\b'), lambda x: 30),  # Added pattern for "a healthy/young/adult/female man/woman/patient with a"
#         (re.compile(r'\ba (female|male|infant) (infant|neonate)\b'), lambda x: 0),  # Added pattern for "a female/male/infant infant/neonate"
#         (re.compile(r'\ba (young|african american|healthy) (man|woman|girl)\b'), lambda x: 20),  # Added pattern for "a young/African American/healthy man/woman/girl"
#         (re.compile(r'\ban otherwise healthy adult man\b'), lambda x: 35),  # Added pattern for "an otherwise healthy adult man"
#         (re.compile(r'\ba young, previously healthy girl\b'), lambda x: 8),  # Added pattern for "a young, previously healthy girl"
#         (re.compile(r'\ba teen[- ]aged boy\b'), lambda x: 15),  # Added pattern for "a teen-aged boy"
#         (re.compile(r'\btwo brothers in their late teens\b'), lambda x: 18),  # Added pattern for "two brothers in their late teens"
#         (re.compile(r'\ba previously healthy man\b'), lambda x: 30),  # Added pattern for "a previously healthy man"
#         (re.compile(r'\ba (male|female) infant with\b'), lambda x: 0),  # Added pattern for "a male/female infant with"
#         (re.compile(r'\ba teenage chinese girl\b'), lambda x: 15),  # Added pattern for "a teenage Chinese girl"
#         (re.compile(r'\ba woman in her (late )?teens with\b'), lambda x: 18 if x.group(1) else 15),  # Added pattern for "a woman in her (late) teens with"
#         (re.compile(r'\ba (young )?black woman with\b'), lambda x: 25),  # Added pattern for "a (young) black woman with"
#         (re.compile(r'\ban (infant|male|female) (infant|neonate|girl)\b'), lambda x: 0),  # Added pattern for "an infant/male/female infant/neonate/girl"
#         (re.compile(r'\ban ex utero intrapartum treatment \(EXIT\) procedure\b'), lambda x: 0),  # Added pattern for "an ex utero intrapartum treatment (EXIT) procedure"
#         (re.compile(r'\ba man presented to\b'), lambda x: 40),  # Added pattern for "a man presented to"
#         (re.compile(r'\b(a|an) (patient|woman|man|child|girl|boy|infant|neonate)\b'), lambda x: {
#             "patient": 50, "woman": 30, "man": 30, "child": 8, "girl": 12, "boy": 12, "infant": 0, "neonate": 0
#         }[x.group(2)]),
#         (re.compile(r'\b(with a|presented with|presented to|was born|born to)\b'), lambda x: None),
#         (re.compile(r'\b(history of|medical history of|born prematurely at|initially seen in)\b'), lambda x: None),
#         (re.compile(r'\b(healthy|adult|female|male|young|teenage|african american|black|chinese)\b'), lambda x: {
#             "healthy": 30, "adult": 40, "female": 30, "male": 30, "young": 20, "teenage": 15, "african american": 30, "black": 30, "chinese": 30
#         }.get(x.group(1), None)),
#         (re.compile(r'\b(ex utero intrapartum treatment \(EXIT\) procedure)\b'), lambda x: 0),
#         (re.compile(r'\b(otherwise healthy)\b'), lambda x: None),
#         (re.compile(r'\b(previously healthy)\b'), lambda x: None),
#         (re.compile(r'\b(a|an) (patient|woman|man|child|girl|boy|infant|neonate)\b'), lambda x: {
#             "patient": 50, "woman": 30, "man": 30, "child": 8, "girl": 12, "boy": 12, "infant": 0, "neonate": 0
#         }[x.group(2)]),
#         (re.compile(r'\b(with a|presented with|presented to|was born|born to)\b'), lambda x: None),
#         (re.compile(r'\b(history of|medical history of|born prematurely at|initially seen in)\b'), lambda x: None),
#         (re.compile(r'\b(healthy|adult|female|male|young|teenage|african american|black|chinese)\b'), lambda x: {
#             "healthy": 30, "adult": 40, "female": 30, "male": 30, "young": 20, "teenage": 15, "african american": 30, "black": 30, "chinese": 30
#         }.get(x.group(1), None)),
#         (re.compile(r'\b(ex utero intrapartum treatment \(EXIT\) procedure)\b'), lambda x: 0),
#         (re.compile(r'\b(otherwise healthy)\b'), lambda x: None),
#         (re.compile(r'\b(previously healthy)\b'), lambda x: None),
#         (re.compile(r'\ba preschool (boy|girl)\b'), lambda x: 4),  # Added pattern for "a preschool boy/girl"
#         (re.compile(r'\ba child presented with\b'), lambda x: 8),  # Added pattern for "a child presented with"
#         (re.compile(r'\ba (girl|boy) presented with\b'), lambda x: 12),  # Added pattern for "a girl/boy presented with"
#         (re.compile(r'\ba woman in her (early|mid|late)? teens\b'), lambda x: {
#             None: 15, "early": 13, "mid": 15, "late": 18
#         }[x.group(1)]),  # Added pattern for "a woman in her (early/mid/late) teens"
#         (re.compile(r'\ba (teenage|teen-aged) (boy|girl)\b'), lambda x: 15),  # Added pattern for "a teenage/teen-aged boy/girl"
#         (re.compile(r'\btwo brothers in their late teens\b'), lambda x: 18),  # Added pattern for "two brothers in their late teens"
#         (re.compile(r'\ba man presented to the emergency department\b'), lambda x: 40),  # Added pattern for "a man presented to the emergency department"
#     ]

#     age = None
#     for pattern, age_func in patterns:
#         match = pattern.search(question_text)
#         if match:
#             if callable(age_func):
#                 age = age_func(match)
#             if age is not None:
#                 return age

#     return age

# # Apply the extract_age function to the 'question' column to create a new 'age' column
# df['age'] = df['question'].apply(extract_age)

# # Identify and print rows where age is NaN
# print("\nRows where age could not be exactly extracted but mapped according to rules:")
# nan_rows = df[df['age'].isna()]
# print(nan_rows[['question']])


In [29]:
# import re

# def extract_age(question_text):
#     patterns = [
#         (re.compile(r'\b(\d+)[- ]?(?:year[- ]?old|years?)\b'), lambda x: int(x.group(1))),
#         (re.compile(r'\bin (?:their|his|her) (\d+)s\b'), lambda x: int(x.group(1)) + 5),
#         (re.compile(r'\b(early|mid|late) (\d+)s\b'), lambda x: int(x.group(2)) + {'early': 2, 'mid': 5, 'late': 8}[x.group(1)]),
#         (re.compile(r'(\d+)[- ]?(?:days?|weeks?|months?)[- ]?old\b'), lambda x: round(int(x.group(1)) / {'days?': 365, 'weeks?': 52, 'months?': 12}[x.group(2)], 2)),
#         (re.compile(r'\b(?:school[- ]?aged?|adolescen(?:ce|t)|toddler|preschooler|teenager|(?:(?:young|middle[- ]?aged) )?adult|elderly)\b'), lambda x: {
#             'school[- ]?aged?': 10, 'adolescen(?:ce|t)': 15, 'toddler': 2, 'preschooler': 4, 'teenager': 15, 'young adult': 22, 'adult': 40, 'middle[- ]?aged': 45, 'elderly': 75
#         }[x.group()]),
#         (re.compile(r'\bnewborn\b'), lambda x: 0.02),
#         (re.compile(r'\baged (\d+) years\b'), lambda x: int(x.group(1))),
#         (re.compile(r'\b(?:young|teenage) (?:boy|girl|man|woman)\b'), lambda x: {'young': 12, 'teenage': 15}[x.group(1)]),
#         (re.compile(r'\bnearing the end of (?:his|her) second decade\b'), lambda x: 19),
#         (re.compile(r'\bgirl (?:younger than 2 years|in her(?: (?:early|mid|late))? teens)\b'), lambda x: {
#             'younger than 2 years': 1, 'in her early teens': 13, 'in her mid teens': 15, 'in her late teens': 18, 'in her teens': 15
#         }[x.group()]),
#         (re.compile(r'\bboy in his teens\b'), lambda x: 15),
#         (re.compile(r'\bpreschool[- ]?age[d]? (?:boy|girl)\b'), lambda x: 4),
#         (re.compile(r'\b(?:(?:African American|Chinese) teenage|white teenage|child younger than 10 years|(?:pre)?adolescent) (?:boy|girl)\b'), lambda x: {
#             '(?:African American|Chinese) teenage': 15, 'white teenage': 15, 'child younger than 10 years': 5, '(?:pre)?adolescent': 10
#         }[x.group()]),
#         (re.compile(r'\bneonate in (?:his|her) (first|second|third|fourth) week of life\b'), lambda x: round({"first": 1, "second": 2, "third": 3, "fourth": 4}[x.group(1)] / 52, 2)),
#         (re.compile(r'\bfull[- ]term \d+[- ]g\b'), lambda x: 0),
#         (re.compile(r'\bex utero intrapartum treatment \(EXIT\)\b'), lambda x: 0),
#         (re.compile(r'\bAfrican American (?:man|woman) in (?:his|her) (early|mid|late)[- ](\d+)s\b'), lambda x: int(x.group(2)) + {"early": 0, "mid": 5, "late": 8}[x.group(1)]),
#         (re.compile(r'\bmale neonate born prematurely at (\d+) weeks\' gestation\b'), lambda x: round(int(x.group(1)) / 52, 2)),
#         (re.compile(r'\b(?:an? )?(?:previously )?(?:healthy|adult|female|male|young|teenage|African American|black|Chinese) (?:patient|woman|man|child|girl|boy|infant|neonate)\b'), lambda x: {
#             'patient': 50, 'woman': 30, 'man': 30, 'child': 8, 'girl': 12, 'boy': 12, 'infant': 0, 'neonate': 0,
#             'healthy': 30, 'adult': 40, 'female': 30, 'male': 30, 'young': 20, 'teenage': 15, 'African American': 30, 'black': 30, 'Chinese': 30
#         }[x.group().split()[-1]]),
#         (re.compile(r'\btwo brothers in their late teens\b'), lambda x: 18),
#         (re.compile(r'\ba (?:teenage|teen-aged) (?:boy|girl)\b'), lambda x: 15),
#         (re.compile(r'\b(?:an? )?(?:woman|girl) in her (early|mid|late)? teens (?:with no|with|presented with)\b'), lambda x: {
#             "early": 13, "mid": 15, "late": 18, None: 15
#         }[x.group(1)]),
#         (re.compile(r'\ban? (?:infant|male|female) (?:infant|neonate|girl) (?:who was|presented|born to)\b'), lambda x: 0),
#         (re.compile(r'\ban? (?:ex utero intrapartum treatment \(EXIT\)|EXIT) (?:procedure|treatment)\b'), lambda x: 0),
#     ]

#     age = None
#     for pattern, age_func in patterns:
#         match = pattern.search(question_text)
#         if match:
#             age = age_func(match)
#             if age is not None:
#                 return age

#     return age

# # Apply the extract_age function to the 'question' column to create a new 'age' column
# df['age'] = df['question'].apply(extract_age)

# # Identify and print rows where age is NaN
# print("\nRows where age could not be exactly extracted but mapped according to rules:")
# nan_rows = df[df['age'].isna()]
# print(nan_rows[['question']])


In [30]:
# nan_age_rows = df[df['age'].isna()]
# print(len(nan_age_rows))


In [31]:
# nan_age_rows['question']


### ethnicity

- is race given? in which cases? is it given only if relevant?

In [15]:
import pandas as pd
import spacy
import re
import numpy as np

# Download the English language model
# You may need to run this once: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def extract_ethnicity(text):
    # List of common ethnicities to look for
    ethnicities = ["White", "Black", "Asian", "Hispanic", "Latino", "African American", "Native American", "Pacific Islander"]
    
    # Use spaCy to process the text
    doc = nlp(text)
    
    # Look for ethnicity mentions
    for ent in doc.ents:
        if ent.label_ == "NORP" and ent.text in ethnicities:
            return ent.text
    
    # If no ethnicity found with spaCy, use regex to look for specific patterns
    for ethnicity in ethnicities:
        if re.search(r'\b' + ethnicity + r'\b', text, re.IGNORECASE):
            return ethnicity
    
    return np.nan

# Assuming your DataFrame is called 'df' and the column with the case text is called 'case'
df['ethnicity'] = df['case'].apply(extract_ethnicity)


In [17]:
# !TODO Takes 37min!
# cases_without_ethnicity = df[df['ethnicity'].isna()]
# print(f"Number of cases without specified ethnicity: {len(cases_without_ethnicity)}")
# for i in range(len(cases_without_ethnicity)):
#     print(f"Case {i+1} of {len(cases_without_ethnicity)}")
#     print(f"The case is: {cases_without_ethnicity['case'].iloc[i]}")
#     print("----")
#     time.sleep(2)  # Pauses for 1 second between prints. Adjust the sleep time as needed.


Number of cases without specified ethnicity: 1110
Case 1 of 1110
The case is: A man in his 30s with AIDS presented with acute-onset painful scattered umbilicated papulopustules and ovoid ulcerated plaques with elevated, pink borders on the face, trunk, and extremities (Figure, A). The patient also had a new-onset cough but was afebrile and denied other systemic symptoms. Due to his significant immunocompromise, the clinical presentation was highly suspicious for infection. For rapid bedside differentiation of multiple infectious etiologies, a Tzanck smear was performed by scraping the base of an ulcerated lesion and inner aspect of a pseudopustule and scraping its base with a #15 blade. These contents were placed on a glass slide, fixed, and stained with Wright-Giemsa and subsequently Papanicolaou staining to further characterize the changes seen.A, Clinical image demonstrating papulopustules and ovoid ulcerated plaques with elevated, pink borders on the elbows.
----
Case 2 of 1110
The

In [26]:
df['ethnicity'].value_counts()


ethnicity
White               319
Black                35
African American     33
Asian                13
Hispanic             11
Latino                1
Name: count, dtype: int64

In [21]:
df.columns


Index(['link', 'field', 'case', 'clinical_question', 'opa', 'opb', 'opc',
       'opd', 'diagnosis', 'answer_idx', 'answer', 'explanation', 'gender',
       'ethnicity'],
      dtype='object')

In [25]:
# Case of breast cancer on a man! -> TRUE
cases_without_ethnicity = df[df['ethnicity'].isna()]
case_572 = cases_without_ethnicity.iloc[571]  # 571 because indexing starts at 0
print(f"Link for Case 572: {case_572['link']}")


Link for Case 572: https://jamanetwork.com/journals/jamasurgery/fullarticle/2661292


In [28]:
# India and TB
cases_without_ethnicity = df[df['ethnicity'].isna()]
case = cases_without_ethnicity.iloc[1037] 
print(f"Link for this Case: {case['link']}")


Link for Case: https://jamanetwork.com/journals/jama/fullarticle/1911306


Here the ethnicity is completely relevant
Also, hard to properly answer the question "how do you interpret the results?" when results are not given

In [29]:
# Nigerian men -> legitimate to give all these details in this case? Dataset is biased itself?
cases_without_ethnicity = df[df['ethnicity'].isna()]
case = cases_without_ethnicity.iloc[1070] 
print(f"Link for this Case: {case['link']}")


Link for this Case: https://jamanetwork.com/journals/jama/fullarticle/1869189


### extract clinical data

In [35]:
import pandas as pd
import spacy
import re
import numpy as np

# Download the English language model
# You may need to run this once: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def extract_ethnicity(text):
    # List of ethnicities to look for
    ethnicities = [
        "White", "Black", "Asian", "Hispanic", "Latino", "African American", 
        "Native American", "Pacific Islander", "Chinese", "Nigerian", "Mexican", 
        "Vietnamese", "Indian", "African", "Japanese", "Cambodian", "Caucasian", 
        "Ukrainian", "Costa Rican", "Bahamian", "Laotian", "West African"
    ]
    
    # Create case-insensitive versions of the ethnicities
    ethnicities = ethnicities + [e.lower() for e in ethnicities]
    
    # Use spaCy to process the text
    doc = nlp(text)
    
    # Look for ethnicity mentions
    for ent in doc.ents:
        if ent.label_ == "NORP" and ent.text in ethnicities:
            return ent.text
    
    # If no ethnicity found with spaCy, use regex to look for specific patterns
    for ethnicity in ethnicities:
        pattern = r'\b' + re.escape(ethnicity) + r'\b'
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group()
    
    # Look for more complex patterns
    patterns = [
        r'of ([\w\s]+) descent',
        r'of ([\w\s]+) ancestry',
        r'of ([\w\s]+) heritage'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match and match.group(1) in ethnicities:
            return match.group(1)
    
    return np.nan

# Assuming your DataFrame is called 'df' and the column with the case text is called 'case'
df['ethnicity'] = df['case'].apply(extract_ethnicity)


In [36]:
print(df['ethnicity'].unique())


[nan 'white' 'black' 'White' 'Chinese' 'Black' 'Asian' 'African American'
 'Nigerian' 'Hispanic' 'Mexican' 'Vietnamese' 'Indian' 'African'
 'Japanese' 'West African' 'Cambodian' 'Caucasian' 'Latino' 'Bahamian']


# Final df saving

In [42]:
df.to_csv("/Users/kenzabenkirane/Desktop/GitHub/24ucl_thesis/thesis_clinical_llm_bias/data/jama_pp.csv", index=False)
