In [1]:
import pandas as pd

# consolidation of datasets

### dataset 1

https://www.kaggle.com/datasets/paultimothymooney/medical-speech-transcription-and-intent/data

In [2]:
# this is that dataset that has the audio recording, on Kaggle
df = pd.read_csv("./data/original_datasets/overview-of-recordings.csv")
df = df[["phrase", "prompt"]]
# i want it to be called phrase, label for column names
df = df.rename(columns={"phrase": "text", "prompt": "og_label"})
df.head()

Unnamed: 0,text,og_label
0,When I remember her I feel down,Emotional pain
1,When I carry heavy things I feel like breaking...,Hair falling out
2,there is too much pain when i move my arm,Heart hurts
3,My son had his lip pierced and it is swollen a...,Infected wound
4,My muscles in my lower back are aching,Infected wound


In [3]:
df.shape

(6661, 2)

In [4]:
df["og_label"].value_counts()

og_label
Acne                  328
Shoulder pain         320
Joint pain            318
Infected wound        306
Knee pain             305
Cough                 293
Feeling dizzy         283
Muscle pain           282
Heart hurts           273
Ear ache              270
Hair falling out      264
Head ache             263
Feeling cold          263
Skin issue            262
Stomach ache          261
Back pain             259
Neck pain             251
Internal pain         248
Blurry vision         246
Body feels weak       241
Hard to breath        233
Emotional pain        231
Injury from sports    230
Foot ache             223
Open wound            208
Name: count, dtype: int64

In [5]:
# Unique phrases
print(f"Total phrases: {len(df)}")
print(f"Unique phrases: {df['text'].nunique()}")

Total phrases: 6661
Unique phrases: 706


In [6]:
# Get all phrases that appear more than once
dupe_phrases = df["text"][df["text"].duplicated(keep=False)]

# Filter the original dataframe to show only those rows
df_dupes = df[df["text"].isin(dupe_phrases)]

# Display the top rows
df_dupes.sort_values("text").head(10)

Unnamed: 0,text,og_label
1091,A terrible fall in hair,Hair falling out
5195,A terrible fall in hair,Hair falling out
4143,A terrible fall in hair,Hair falling out
1081,A terrible fall in hair,Hair falling out
1291,A terrible fall in hair,Hair falling out
4370,A terrible fall in hair,Hair falling out
4637,A terrible fall in hair,Hair falling out
4686,A terrible fall in hair,Hair falling out
192,A terrible fall in hair,Hair falling out
1019,After a breakup I feel something strange in me.,Emotional pain


There are many repeats in the dataset. Out of the original 6661 rows, there are only 706 unique rows in the patient text columns  

Dug deeper into the original Kaggle dataset. Those with same text do have different audio (different people speaking in the recording audio file), but the recordings say the same phrase

going to drop the duplicate rows

In [7]:
df_unique_text = df.drop_duplicates(subset="text", keep="first")

print(f"Original size: {len(df)}")
print(f"After keeping one row per text: {len(df_unique_text)}")
df_unique_text.head()

Original size: 6661
After keeping one row per text: 706


Unnamed: 0,text,og_label
0,When I remember her I feel down,Emotional pain
1,When I carry heavy things I feel like breaking...,Hair falling out
2,there is too much pain when i move my arm,Heart hurts
3,My son had his lip pierced and it is swollen a...,Infected wound
4,My muscles in my lower back are aching,Infected wound


by dropping the duplicate rows, we are now left with only 706 rows in this dataset

### dataset 2

https://www.kaggle.com/datasets/niyarrbarman/symptom2disease

In [8]:
# this data comes from: https://www.kaggle.com/datasets/niyarrbarman/symptom2disease
s2d_df = pd.read_csv("./data/original_datasets/Symptom2Disease.csv")
s2d_df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [9]:
s2d_df = s2d_df[["text", "label"]]
s2d_df = s2d_df.rename(columns={"text": "text", "label": "og_label"})
s2d_df.head()

Unnamed: 0,text,og_label
0,I have been experiencing a skin rash on my arm...,Psoriasis
1,"My skin has been peeling, especially on my kne...",Psoriasis
2,I have been experiencing joint pain in my fing...,Psoriasis
3,"There is a silver like dusting on my skin, esp...",Psoriasis
4,"My nails have small dents or pits in them, and...",Psoriasis


In [10]:
s2d_df.shape

(1200, 2)

In [11]:
s2d_df["og_label"].value_counts()

og_label
Psoriasis                          50
Varicose Veins                     50
peptic ulcer disease               50
drug reaction                      50
gastroesophageal reflux disease    50
allergy                            50
urinary tract infection            50
Malaria                            50
Jaundice                           50
Cervical spondylosis               50
Migraine                           50
Hypertension                       50
Bronchial Asthma                   50
Acne                               50
Arthritis                          50
Dimorphic Hemorrhoids              50
Pneumonia                          50
Common Cold                        50
Fungal infection                   50
Dengue                             50
Impetigo                           50
Chicken pox                        50
Typhoid                            50
diabetes                           50
Name: count, dtype: int64

### dataset 3

In [12]:
# https://huggingface.co/datasets/QuyenAnhDE/Diseases_Symptoms?utm_source=chatgpt.com
ds_df = pd.read_csv("hf://datasets/QuyenAnhDE/Diseases_Symptoms/Diseases_Symptoms.csv")
ds_df.head()

Unnamed: 0,Code,Name,Symptoms,Treatments
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal"
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ..."


In [13]:
ds_df = ds_df[["Symptoms", "Name"]]
# i want it to be called phrase, label for column names
ds_df = ds_df.rename(columns={"Symptoms": "text", "Name": "og_label"})

In [14]:
ds_df.shape

(400, 2)

In [15]:
ds_df["og_label"].value_counts()

og_label
Sciatica                                 3
Complex Regional Pain Syndrome (CRPS)    2
Urinary Tract Infection (UTI)            2
Mucocele                                 2
Dermatitis due to Sun Exposure           2
                                        ..
Neonatal Jaundice                        1
Presbycusis                              1
Empyema                                  1
Tuberous Sclerosis                       1
Type 2 Diabetes                          1
Name: count, Length: 392, dtype: int64

I am not going to include dataset 3 for the time being, as it requires me to do a lot research on these condition names, with not a lot of pay off as far as increasing our data size (only adds 400 rows)

## bringing datasets together

In [16]:
bigdata = pd.concat([df_unique_text, s2d_df, ds_df])
bigdata.head()

Unnamed: 0,text,og_label
0,When I remember her I feel down,Emotional pain
1,When I carry heavy things I feel like breaking...,Hair falling out
2,there is too much pain when i move my arm,Heart hurts
3,My son had his lip pierced and it is swollen a...,Infected wound
4,My muscles in my lower back are aching,Infected wound


In [17]:
bigdata.shape

(2306, 2)

In [18]:
bigdata["og_label"].value_counts()

og_label
Acne                  84
Psoriasis             51
Pneumonia             51
Varicose Veins        51
Migraine              51
                      ..
Empyema                1
Tuberous Sclerosis     1
Astigmatism            1
Acute Otitis Media     1
Type 2 Diabetes        1
Name: count, Length: 435, dtype: int64

In [19]:
bigdata["LABEL"] = None
bigdata.head()

Unnamed: 0,text,og_label,LABEL
0,When I remember her I feel down,Emotional pain,
1,When I carry heavy things I feel like breaking...,Hair falling out,
2,there is too much pain when i move my arm,Heart hurts,
3,My son had his lip pierced and it is swollen a...,Infected wound,
4,My muscles in my lower back are aching,Infected wound,


dermatology
    acne
    hair falling out
    skin issue
    Psoriasis
    Acne
    Dermatitis due to Sun Exposure  

psychological
    Emotional pain

diabetes

body pain
    Shoulder pain
    Joint pain
    Knee pain
    Muscle pain
    Back pain
    Neck pain
    Foot ache

gynecology/urology
    Urinary Tract Infection (UTI)
    urinary tract infection 


In [27]:
medical_condition_mapping = {
    "Infections": [
        "Infected wound",
        "Open wound",
        "Fungal infection",
        "Pneumonia",
        "Malaria",
        "Dengue",
        "Typhoid",
        "urinary tract infection",
        "Urinary Tract Infection (UTI)",
        "Empyema",
        "Acute Otitis Media",
        "Chancroid",
        "Cold Sore",
        "Herpangina",
        "Pharyngitis",
        "Infection of Open Wound",
        "Abscess of Nose",
        "Cryptococcosis",
    ],
    "Dermatological & Skin Conditions": [
        "Acne",
        "Skin issue",
        "Hair falling out",
        "Psoriasis",
        "Dimorphic Hemorrhoids",
        "Impetigo",
        "Chicken pox",
        "Dermatitis due to Sun Exposure",
        "Mucocele",
        "Actinic Keratosis",
    ],
    "Chronic Conditions": [
        "Hypertension",
        "diabetes",
        "Arthritis",
        "Cervical spondylosis",
        "Varicose Veins",
        "Heart hurts",
        "Fibromyalgia",
        "Tuberous Sclerosis",
        "Type 2 Diabetes",
        "Endometriosis",
        "Hypocalcemia",
        "Spondylosis",
        "Type 1 Diabetes",
        "Chronic Rheumatic Fever",
        "Juvenile Rheumatoid Arthritis",
        "Rheumatoid Arthritis",
        "Turner syndrome",
        "Testicular Cancer",
        "Premature Ventricular Cancer (PVCs)",
        "Protein Deficiency",
        "Esophageal Cancer",
        "Osteoporosis",
        "Cryptorchidism",
        "Leukemia",
        "Hydrocephalus",
        "Premature Ventricular Contractions (PVCs)",
        "Thyroid Disease",
        "Anemia due to Malignancy",
        "Breast Cancer",
        "Osteoarthritis",
        "Von Willebrand Disease",
        "Obesity",
    ],
    "Respiratory & Sensory Issues": [
        "Cough",
        "Feeling cold",
        "Hard to breath",
        "Bronchial Asthma",
        "Common Cold",
        "Ear ache",
        "Blurry vision",
        "Presbycusis",
        "Astigmatism",
        "Vocal cord polyp",
        "Macular Degeneration",
        "Pulmonary Congestion",
    ],
    "Allergic/Immunologic Reactions": [
        "allergy",
        "drug reaction",
        "Conjunctivitis due to Allergy",
    ],
    "Neurological & General Symptoms": [
        "Migraine",
        "Feeling dizzy",
        "Emotional pain",
        "Body feels weak",
        "Panic disorder",
        "Poisoning due to Antidepressants",
        "Tension Headache",
        "Myoclonus",
        "Ethylene glycol poisoning",
    ],
    "Pain & Injuries": [
        "Shoulder pain",
        "Joint pain",
        "Knee pain",
        "Muscle pain",
        "Back pain",
        "Neck pain",
        "Foot ache",
        "Head ache",
        "Internal pain",
        "Injury from sports",
        "Sciatica",
        "Complex Regional Pain Syndrome (CRPS)",
        "Dislocation of the Elbow",
        "Bone Spur of the Calcaneus",
        "Bursitis",
        "Urinary Stones (Kidney Stones)",
    ],
    "Gastrointestinal & Hepatobiliary Conditions": [
        "Stomach ache",
        "peptic ulcer disease",
        "gastroesophageal reflux disease",
        "Jaundice",
        "Neonatal Jaundice",
        "Indigestion",
        "Gastroenteritis (Stomach Flu)",
    ],
}

In [28]:
# Flatten the medical_condition_mapping into a condition-to-category mapping
condition_to_category = {
    condition: category
    for category, conditions in medical_condition_mapping.items()
    for condition in conditions
}

# Map the categories to the 'og_label' column
bigdata["LABEL"] = bigdata["og_label"].map(condition_to_category)

In [29]:
# see which ones I havent mapped yet
bigdata[bigdata["LABEL"].isna()]["og_label"].value_counts()

og_label
Ethylene glycol poisoning-1    1
Cystic Fibrosis                1
Esophageal Varices             1
Myopia                         1
Adrenal Cancer                 1
                              ..
Colonic Polyp                  1
Heart Block                    1
Marijuana Abuse                1
Abdominal Hernia               1
Bronchitis                     1
Name: count, Length: 330, dtype: int64

In [30]:
bigdata["LABEL"].value_counts()

LABEL
Infections                                     368
Dermatological & Skin Conditions               346
Chronic Conditions                             310
Pain & Injuries                                298
Respiratory & Sensory Issues                   242
Gastrointestinal & Hepatobiliary Conditions    180
Neurological & General Symptoms                131
Allergic/Immunologic Reactions                 101
Name: count, dtype: int64

ensure no data leakage

reduce num labels

if still weird, 

F1 score (Pr) see whats going on, will help see clas imbalance

In [32]:
bigdata.head(20)

Unnamed: 0,text,og_label,LABEL
0,When I remember her I feel down,Emotional pain,Neurological & General Symptoms
1,When I carry heavy things I feel like breaking...,Hair falling out,Dermatological & Skin Conditions
2,there is too much pain when i move my arm,Heart hurts,Chronic Conditions
3,My son had his lip pierced and it is swollen a...,Infected wound,Infections
4,My muscles in my lower back are aching,Infected wound,Infections
5,i have muscle pain that my back\nI Have Muscle...,Foot ache,Pain & Injuries
6,I have muscle pain in my left leg,Shoulder pain,Pain & Injuries
7,I have cut my finger because of playing footba...,Injury from sports,Pain & Injuries
8,I have acne in my face and other problems in m...,Skin issue,Dermatological & Skin Conditions
9,I have a strange rash on my arm,Foot ache,Pain & Injuries


In [31]:
bigdata.shape

(2306, 3)

In [26]:
bigdata.to_csv("./data/final_data.csv", index=False)