Loading the required libraries

In [1]:
# Third-party / standard-library dependencies of the notebook.
import pandas as pd
# Raise the per-cell text width pandas uses when rendering frames, so long
# clinical text columns are shown up to 5000 characters wide.
pd.set_option('display.max_colwidth', 5000)
import json
from dotenv import load_dotenv
# Project-local helpers: prompt wrappers for the GPT and Ollama back ends, plus
# dataset utilities (field converters, department filter/statistics, PDF report).
from src.gpt import doctor_prompt_gpt,doctor_prompt_gpt_self_confinement
from src.utils import convert_string_to_list,convert_clinical_case_summary,filterDepartment,getDepartmentStatistics,convert_cases_to_json,PDF,select_case_components
from src.ollama import doctor_prompt_ollama,doctor_prompt_ollama_self_refinement,doctor_prompt_ollama_combined
# Read variables from a .env file into the process environment; the return
# value of this call is what the cell displays.
# NOTE(review): presumably this supplies the API credentials used by src.gpt -- confirm.
load_dotenv()



True

Loading the dataset

In [2]:
# Path of the English-language case file, relative to the notebook.
filePath = "dataset/clinicallab/data_en.json"
with open(filePath, 'r', encoding='utf-8') as f:
    data = json.load(f)

print("\nnumber of total cases are", len(data))
print("\neach case have the following fields", list(data[0].keys()))

# Fields copied into the DataFrame; every other field of a record is dropped.
keys_to_include = [
    "id",
    'clinical_department',
    'principal_diagnosis',
    'preliminary_diagnosis',
    'diagnostic_basis',
    'differential_diagnosis',
    'treatment_plan',
    'clinical_case_summary',
    'imageological_examination',
    'laboratory_examination',
    'pathological_examination',
    'therapeutic_principle',
]
# Records are indexed by key, so a record lacking one of the fields raises KeyError.
df = pd.DataFrame(
    [{field: record[field] for field in keys_to_include} for record in data]
)

# Number of cases per clinical department.
allDepartments = df['clinical_department'].value_counts()
print("number of departments available are", len(allDepartments))

print("\n all the departments available are")
print(allDepartments)

# These four columns all go through the same project converter.
# NOTE(review): presumably they hold list-like text that the converter parses
# into Python lists -- confirm against src.utils.
for list_column in (
    'preliminary_diagnosis',
    'diagnostic_basis',
    'differential_diagnosis',
    'treatment_plan',
):
    df[list_column] = df[list_column].apply(convert_string_to_list)

# The case summary has its own dedicated converter.
df["clinical_case_summary"] = df["clinical_case_summary"].apply(convert_clinical_case_summary)


number of total cases are 1500

each case have the following fields ['id', 'clinical_case_uid', 'language', 'clinical_department', 'principal_diagnosis', 'preliminary_diagnosis', 'diagnostic_basis', 'differential_diagnosis', 'treatment_plan', 'clinical_case_summary', 'imageological_examination', 'laboratory_examination', 'pathological_examination', 'therapeutic_principle']
number of departments available are 24

 all the departments available are
clinical_department
orthopedics department                              100
anus and intestine surgical department              100
hepatobiliary and pancreas surgical department       99
urinary surgical department                          90
endocrinology department                             80
gynecology department                                80
otolaryngology head and neck surgical department     80
neurology department                                 80
thoracic surgical department                         70
respiratory medicine dep

Filtering the dataset by clinical department

In [3]:
# Department inspected by this cell; the trailing comment lists the other
# department names that were tried here.
department="gynecology department"#"respiratory medicine department"#"nephrology department"##"pediatrics department"#"gynecology department"
# Restrict the full frame to the chosen department.
# NOTE(review): presumably a row filter on `clinical_department` -- confirm in src.utils.
departmentdf=filterDepartment(df,department)
# Disabled export of the filtered frame to a per-department JSON file.
# departmentdf.to_json(f"dataset/clinicallab/department/{department}.json", orient='records', lines=False, indent=4)
# Prints the per-diagnosis case counts for the department (see the cell output).
getDepartmentStatistics(departmentdf)

number of principal diagnosis are 8
principal_diagnosis
ovarian cyst             10
endometrial polyp        10
endometrial cancer       10
uterine fibroid          10
adenomyosis              10
cervical polyp           10
cervical cancer          10
heterotopic pregnancy    10
Name: count, dtype: int64
number of preliminary_diagnosis are 8


In [4]:
# Extract the disease names from one row's differential-diagnosis entries
def extract_disease_names_from_row(differential_diagnosis_list):
    """Return the leading name of every entry in a differential-diagnosis list.

    Each entry is cut at its first ":" and the part before it is stripped of
    surrounding whitespace. An entry without a colon is returned whole
    (stripped).

    Args:
        differential_diagnosis_list: iterable of strings.

    Returns:
        list[str]: one name per entry, in the original order.
    """
    names = []
    for entry in differential_diagnosis_list:
        # partition() splits on the first colon only; element 0 is the text before it.
        head = entry.partition(":")[0]
        names.append(head.strip())
    return names

# Apply the function to each row and combine all per-row lists into one flat list.
# A comprehension is used instead of Series.sum(): summing an empty Series
# returns 0 rather than [], which the loop below cannot iterate, and summing
# lists re-copies the accumulated list for every row.
differential_diseases = [
    name
    for row_names in departmentdf["differential_diagnosis"].apply(extract_disease_names_from_row)
    for name in row_names
]

# Keep only entries shorter than 20 characters.
# NOTE(review): presumably this drops entries where no colon was found and the
# "name" is really a whole sentence -- confirm the intent of the threshold.
refined_differential_diseases = [
    disease for disease in differential_diseases if len(disease) < 20
]

# Principal diagnoses of the department, in order of first appearance.
uniqueDiseases = departmentdf["principal_diagnosis"].unique().tolist()
# Independent copy taken before the differential names are merged in.
uniquePrimary = uniqueDiseases[:]
uniqueDiseases.extend(refined_differential_diseases)
# De-duplicate. The comparison is case-sensitive, so spellings that differ
# only in capitalisation remain separate entries, and set() does not keep order.
uniqueDiseases = list(set(uniqueDiseases))
print("number of unique diseases are",len(uniqueDiseases))
print(uniqueDiseases)

number of unique diseases are 46
['Vaginal Wall Mass', 'heterotopic pregnancy', 'Endometrial polyps', 'cervical cancer', 'Uterine leiomyoma', 'cervical polyp', 'Acute Salpingitis', 'Gastric cancer', 'Adenomyosis', 'Acute salpingitis', 'Endometrial Cancer', 'Uterine Adenomyoma', 'Chronic cervicitis', 'Hemolytic Anemia', 'uterine fibroid', 'Hydrosalpinx', 'Cervical lesions', 'Appendicitis', 'Ovarian tumor', 'Complete Abortion', 'Uterine Leiomyoma', 'Acute appendicitis', 'Vaginal wall mass', 'Ectopic Pregnancy', 'Acute Appendicitis', 'Endometrial cancer', 'endometrial polyp', 'Uterine Sarcoma', 'Atrophic vaginitis', 'Ovarian Tumor', 'ovarian cyst', 'Cervical cancer', 'Uterine sarcoma', 'endometrial cancer', 'Endometrial Lesions', "Meniere's Disease", 'Uterine fibroids', 'Endometrial Lesion', 'adenomyosis', 'Incomplete Abortion', 'Endometrial lesions', 'Uterine adenomyoma', 'Miscarriage', 'Cervical Lesions', 'Endometrial lesion', 'Cervical lesion']


In [5]:
# Case-summary sections requested from select_case_components for every case.
# The strings are runtime data and are kept exactly as written.
required_fields = [
    "Patient basic information",
    "Chief complaint",
    "Medical history",
    "Physical examination",
    "Laboratory examination",
    "Imageological examination",
    "Auxillary examination",
    "Pathological examination",
]

# Departments the experiment cells iterate over.
departments = [
    "respiratory medicine department",
    "nephrology department",
    "pediatrics department",
    "gynecology department",
    "endocrinology department",
    "neurology department",
    "cardiac surgical department",
    "gastrointestinal surgical department",
]

# Model tags available for the Ollama prompts ("mistral" was also considered).
models = [
    "llama3.1",
    "gemma2",
    "phi3:14b",
    "mistral-nemo",
]

# Default report variants passed to select_case_components; the experiment
# cells below overwrite both values before use.
laboratory = "result"
image = "findings"


In [12]:
# Dry run: walk every department and sampled case without querying any model,
# to check that case selection works for the chosen report variant.
laboratory = "abnormal"
image = "impression"
report_type = "_".join((laboratory, image))

for department in departments:
    print("department is", department)
    departmentdf = filterDepartment(df, department)

    # Every 10th case number starting at 1. The per-department statistics show
    # 10 cases per principal diagnosis, so this gives one case per diagnosis.
    caseNumbers = list(range(1, len(departmentdf), 10))
    print(caseNumbers)

    row = departmentdf
    # A report object is set up per department but nothing is written from it here.
    pdf = PDF()
    pdf.set_left_margin(10)
    pdf.set_right_margin(10)
    getDepartmentStatistics(departmentdf)

    for caseNumber in caseNumbers:
        (
            case_id,
            principal_diagnosis,
            differential_diagnosis,
            clinical_case_dict,
            filtered_clinical_case_dict,
        ) = select_case_components(departmentdf, caseNumber, required_fields, laboratory, image)


department is respiratory medicine department
[1, 11, 21, 31, 41, 51, 61]
number of principal diagnosis are 7
principal_diagnosis
respiratory failure                      10
chronic obstructive pulmonary disease    10
bronchiectasis                           10
pulmonary embolism                       10
pulmonary tuberculosis                   10
lung abscess                             10
pulmonary infection                      10
Name: count, dtype: int64
number of preliminary_diagnosis are 7
department is nephrology department
[1, 11, 21, 31, 41, 51]
number of principal diagnosis are 6
principal_diagnosis
acute pyelonephritis       10
acute renal failure        10
chronic kidney disease     10
urinary tract infection    10
glomerulonephritis         10
nephrotic syndrome         10
Name: count, dtype: int64
number of preliminary_diagnosis are 6
department is pediatrics department
[1, 11, 21, 31, 41, 51]
number of principal diagnosis are 6
principal_diagnosis
Epstein-Barr virus inf

In [None]:
# GPT-4 run: for each sampled case, print the first-pass diagnosis and then the
# result of the follow-up prompt that is given that first answer.
laboratory = "abnormal"
image = "impression"
report_type = "{}_{}".format(laboratory, image)

for department in departments:
    print("department is", department)
    departmentdf = filterDepartment(df, department)

    # Case numbers 1, 11, 21, ... up to the size of the department frame.
    caseNumbers = list(range(1, len(departmentdf), 10))
    print(caseNumbers)

    row = departmentdf
    pdf = PDF()
    pdf.set_left_margin(10)
    pdf.set_right_margin(10)
    getDepartmentStatistics(departmentdf)

    for caseNumber in caseNumbers:
        selected = select_case_components(
            departmentdf, caseNumber, required_fields, laboratory, image
        )
        (case_id, principal_diagnosis, differential_diagnosis,
         clinical_case_dict, filtered_clinical_case_dict) = selected
        del selected  # temporary only; leaves the cell's globals as they were

        print(case_id)
        print(principal_diagnosis)

        model = "gpt-4"
        # First pass over the filtered case, with the case's differential list.
        output = doctor_prompt_gpt(
            filtered_clinical_case_dict, model, differential_diagnosis, department
        )
        print(output)
        # Second pass receives the first answer as an extra argument.
        output2 = doctor_prompt_gpt_self_confinement(
            filtered_clinical_case_dict, model, differential_diagnosis, department, output
        )
        print(output2)

In [None]:
# Multi-model comparison: for each department, query GPT-4 and three Ollama
# models on every sampled case and write the answers to one PDF per department.
laboratory = "abnormal"
image = "impression"
report_type = f"{laboratory}_{image}"

for department in departments:
    print("department is", department)
    departmentdf = filterDepartment(df, department)

    # Case numbers 1, 11, 21, ... up to the size of the department frame.
    caseNumbers = list(range(1, len(departmentdf), 10))
    print(caseNumbers)

    row = departmentdf  # not read in this cell; kept in case a later cell uses it
    pdf = PDF()
    pdf.set_left_margin(10)
    pdf.set_right_margin(10)
    getDepartmentStatistics(departmentdf)

    for caseNumber in caseNumbers:
        (
            case_id,
            principal_diagnosis,
            differential_diagnosis,
            clinical_case_dict,
            filtered_clinical_case_dict,
        ) = select_case_components(departmentdf, caseNumber, required_fields, laboratory, image)

        # All models receive the same candidate list: this case's own
        # differential diagnoses. GPT-4 used to receive `uniquePrimary`, which
        # is built once from a single department's frame in an earlier cell and
        # is therefore wrong for every other department in this loop.
        diagnoses = [
            ("gpt-4", doctor_prompt_gpt(filtered_clinical_case_dict, "gpt-4", differential_diagnosis, department)),
            ("llama3.1", doctor_prompt_ollama(filtered_clinical_case_dict, "llama3.1", differential_diagnosis, department)),
            ("phi3:14b", doctor_prompt_ollama(filtered_clinical_case_dict, "phi3:14b", differential_diagnosis, department)),
            ("mistral-nemo", doctor_prompt_ollama(filtered_clinical_case_dict, "mistral-nemo", differential_diagnosis, department)),
        ]
        pdf.add_case(case_id, principal_diagnosis, differential_diagnosis, clinical_case_dict, diagnoses)
        print("done for caseid", case_id)

    # Write one report per department. `report_type` already contains
    # laboratory and image, so both appear twice in the file name; the name is
    # left unchanged so existing report paths stay valid.
    pdf_file_path = f"./medical-reports/{department}_{report_type}_combined_{laboratory}_{image}.pdf"
    pdf.output(pdf_file_path)

    print(f"PDF report generated: {pdf_file_path}")