loading required libraries

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 5000)
import json
from dotenv import load_dotenv
from src.gpt import doctor_prompt_disease_restricted_gpt
from src.utils import convert_string_to_list,convert_clinical_case_summary,filterDepartment,getDepartmentStatistics,convert_cases_to_json,PDF
from src.ollama import doctor_prompt_disease_restricted_ollama,doctor_prompt_disease_restricted_ollama_combined
load_dotenv()



True

importing the dataset

In [2]:
filePath="dataset/clinicallab/data_en.json"
with open(filePath, 'r', encoding='utf-8') as f:
            data = json.load(f)
print("number of total cases are",len(data))
print("each case have the following fields",list(data[0].keys()))

number of total cases are 1500
each case have the following fields ['id', 'clinical_case_uid', 'language', 'clinical_department', 'principal_diagnosis', 'preliminary_diagnosis', 'diagnostic_basis', 'differential_diagnosis', 'treatment_plan', 'clinical_case_summary', 'imageological_examination', 'laboratory_examination', 'pathological_examination', 'therapeutic_principle']


filtering the dataset

In [3]:

keys_to_include = ["id",'clinical_department', 'principal_diagnosis', 'preliminary_diagnosis',
                   'diagnostic_basis', 'differential_diagnosis', 
                   'treatment_plan', 'clinical_case_summary', 'imageological_examination', 
                   'laboratory_examination', 'pathological_examination', 'therapeutic_principle']
df = pd.DataFrame([{key: d[key] for key in keys_to_include} for d in data])

In [4]:
allDepartments=df['clinical_department'].value_counts()
print("number of departments available are",len(allDepartments))

print(allDepartments)

df['preliminary_diagnosis'] = df['preliminary_diagnosis'].apply(convert_string_to_list)
df['diagnostic_basis'] = df['diagnostic_basis'].apply(convert_string_to_list)
df['differential_diagnosis'] = df['differential_diagnosis'].apply(convert_string_to_list)
df['treatment_plan'] = df['treatment_plan'].apply(convert_string_to_list)
df["clinical_case_summary"] = df["clinical_case_summary"].apply(convert_clinical_case_summary)

number of departments available are 24
clinical_department
orthopedics department                              100
anus and intestine surgical department              100
hepatobiliary and pancreas surgical department       99
urinary surgical department                          90
endocrinology department                             80
gynecology department                                80
otolaryngology head and neck surgical department     80
neurology department                                 80
thoracic surgical department                         70
respiratory medicine department                      70
gastroenterology department                          70
neurosurgery department                              70
cardiac surgical department                          70
nephrology department                                60
gastrointestinal surgical department                 60
pediatrics department                                60
thyroid surgical department                  

In [5]:
department="gynecology department"#"respiratory medicine department"#"nephrology department"##"pediatrics department"#"gynecology department"
departmentdf=filterDepartment(df,department)
departmentdf.to_json(f"dataset/clinicallab/department/{department}.json", orient='records', lines=False, indent=4)

In [6]:
getDepartmentStatistics(departmentdf)

number of principal diagnosis are 8
principal_diagnosis
ovarian cyst             10
endometrial polyp        10
endometrial cancer       10
uterine fibroid          10
adenomyosis              10
cervical polyp           10
cervical cancer          10
heterotopic pregnancy    10
Name: count, dtype: int64
number of preliminary_diagnosis are 8


In [7]:
# Function to extract disease names from a single row
def extract_disease_names_from_row(differential_diagnosis_list):
    disease=[entry.split(":")[0].strip() for entry in differential_diagnosis_list]
    return disease

# Apply the function to each row and combine all lists into one
differential_diseases = departmentdf["differential_diagnosis"].apply(extract_disease_names_from_row).sum()
refined_differential_diseases=[]
for disease in differential_diseases:
    if len(disease) <20:
        refined_differential_diseases.append(disease)
uniqueDiseases=departmentdf["principal_diagnosis"].unique().tolist()
uniquePrimary=uniqueDiseases[:]
uniqueDiseases.extend(refined_differential_diseases)
uniqueDiseases=list(set(uniqueDiseases))
print("number of unique diseases are",len(uniqueDiseases))
print(uniqueDiseases)

number of unique diseases are 46
['Cervical Lesions', 'endometrial cancer', 'Uterine Sarcoma', "Meniere's Disease", 'Cervical lesions', 'Endometrial Cancer', 'Uterine fibroids', 'Incomplete Abortion', 'Uterine Adenomyoma', 'Acute Appendicitis', 'Hemolytic Anemia', 'uterine fibroid', 'Atrophic vaginitis', 'endometrial polyp', 'Acute appendicitis', 'Ectopic Pregnancy', 'ovarian cyst', 'adenomyosis', 'Acute salpingitis', 'Endometrial Lesion', 'Endometrial polyps', 'Vaginal wall mass', 'Miscarriage', 'Chronic cervicitis', 'Gastric cancer', 'Endometrial lesions', 'Complete Abortion', 'Endometrial Lesions', 'cervical polyp', 'Acute Salpingitis', 'heterotopic pregnancy', 'Uterine leiomyoma', 'Ovarian tumor', 'Endometrial lesion', 'Ovarian Tumor', 'Adenomyosis', 'Cervical cancer', 'Appendicitis', 'Vaginal Wall Mass', 'Endometrial cancer', 'Uterine sarcoma', 'cervical cancer', 'Cervical lesion', 'Uterine adenomyoma', 'Hydrosalpinx', 'Uterine Leiomyoma']


In [8]:
departmentdf.columns

Index(['id', 'clinical_department', 'principal_diagnosis',
       'preliminary_diagnosis', 'diagnostic_basis', 'differential_diagnosis',
       'treatment_plan', 'clinical_case_summary', 'imageological_examination',
       'laboratory_examination', 'pathological_examination',
       'therapeutic_principle'],
      dtype='object')

In [9]:
def select_case_components(departmentdf,rowNumber,required_fields):
    row = departmentdf.iloc[rowNumber]
    case_id = row.id
    clinical_case = row.clinical_case_summary
    principal_diagnosis = row.principal_diagnosis
    differential_diagnosis = row.differential_diagnosis
    clinical_case_dict={
    "Patient basic information":clinical_case['Patient Basic Information'],
    "Chief complaint" : clinical_case['Chief Complaint'],
    "Medical history" : clinical_case['Medical History'],
    "Physical examination" : clinical_case['Physical Examination'],
    "Laboratory examination" : row.laboratory_examination,
    "Imageological examination" : row.imageological_examination,
    "Auxillary examination": clinical_case['Auxiliary Examination'],
    "Pathological examination" : row.pathological_examination
    }
    filtered_clinical_case_dict={}
    for key in required_fields:
        filtered_clinical_case_dict[key]=clinical_case_dict[key]
    return case_id,principal_diagnosis,differential_diagnosis,clinical_case_dict,filtered_clinical_case_dict

In [None]:
required_fields=[ "Patient basic information",
                 "Chief complaint",
                 "Medical history",
                 "Physical examination",
                 "Laboratory examination",
                 "Imageological examination",
                 "Auxillary examination",
                 "Pathological examination"
    
]
departments=["respiratory medicine department",
             "nephrology department",
             "pediatrics department",
             "gynecology department",
             "endocrinology department",
             "gynecology department",    
             "neurology department",
             "cardiac surgical department",                          
             "gastrointestinal surgical department" ]
models = ["llama3.1", "mistral", "gemma2", "phi3:14b", "mistral-nemo"]


In [10]:

for department in departments:
    print("department is",department)
    departmentdf=filterDepartment(df,department)
    print(len(departmentdf))
    caseNumbers = [i for i in range(1, len(departmentdf), 10)]
    caseNumbers=[1]#,11,21]#31,41,51,61]
    print(caseNumbers)
    differential_diseases = departmentdf["differential_diagnosis"].apply(extract_disease_names_from_row).sum()
    refined_differential_diseases=[]
    for disease in differential_diseases:
        if len(disease) <20:
            refined_differential_diseases.append(disease)
    uniqueDiseases=departmentdf["principal_diagnosis"].unique().tolist()
    uniquePrimary=uniqueDiseases[:]
    uniqueDiseases.extend(refined_differential_diseases)
    uniqueDiseases=list(set(uniqueDiseases))
    
    pdf = PDF()
    pdf.set_left_margin(10)
    pdf.set_right_margin(10)
    getDepartmentStatistics(departmentdf)
    for caseNumber in caseNumbers:
        print(caseNumber)
        case_id,principal_diagnosis,differential_diagnosis,clinical_case_dict,filtered_clinical_case_dict=select_case_components(departmentdf,caseNumber,required_fields)
         # ("gpt-4", doctor_prompt_disease_restricted_gpt(filtered_clinical_case_dict, "gpt-4", uniquePrimary, department)),
        diagnoses = [(model_name, doctor_prompt_disease_restricted_ollama(filtered_clinical_case_dict, model_name, uniquePrimary, department)) for model_name in models]

        
        print("done diagnosing")
        pdf.add_case(case_id, principal_diagnosis, differential_diagnosis, clinical_case_dict, diagnoses)

    # Output the PDF to a file
    pdf_file_path = f"./medical-reports/medical_case_report_{department}.pdf"
    pdf.output(pdf_file_path)

    print(f"PDF report generated: {pdf_file_path}")

department is gynecology department
80
[1]
number of principal diagnosis are 8
principal_diagnosis
ovarian cyst             10
endometrial polyp        10
endometrial cancer       10
uterine fibroid          10
adenomyosis              10
cervical polyp           10
cervical cancer          10
heterotopic pregnancy    10
Name: count, dtype: int64
number of preliminary_diagnosis are 8
1
started model  llama3.1
done for model llama3.1
started model  mistral
done for model mistral
started model  gemma2
done for model gemma2
started model  phi3:14b
done for model phi3:14b
started model  mistral-nemo
done for model mistral-nemo
done diagnosing
PDF report generated: ./medical-reports/medical_case_report_gynecology department.pdf
