In [1]:
import pandas as pd
from io import StringIO

# 100-row medical symptom dataset
data = """
Disease,Symptom,Description
Common Cold,"['runny nose', 'sore throat', 'cough', 'sneezing']","Viral infection of upper respiratory tract"
Influenza,"['fever', 'body aches', 'fatigue', 'headache']","Viral infection causing systemic symptoms"
Allergies,"['sneezing', 'itchy eyes', 'nasal congestion']","Immune response to environmental triggers"
Migraine,"['throbbing headache', 'nausea', 'light sensitivity']","Neurological condition with severe headaches"
Gastroenteritis,"['diarrhea', 'nausea', 'stomach cramps']","Inflammation of stomach and intestines"
Bronchitis,"['persistent cough', 'chest congestion', 'fatigue']","Inflammation of bronchial tubes"
Sinusitis,"['facial pain', 'nasal congestion', 'headache']","Inflammation of sinus cavities"
Strep Throat,"['sore throat', 'fever', 'swollen lymph nodes']","Bacterial infection of the throat"
UTI,"['burning urination', 'frequent urination', 'pelvic pain']","Urinary tract infection"
Pneumonia,"['cough', 'fever', 'shortness of breath']","Lung infection causing inflammation"
Common Cold,"['congestion', 'mild fever', 'sore throat']","Mild viral infection affecting nose and throat"
Influenza,"['high fever', 'chills', 'muscle pain']","Acute respiratory illness with systemic effects"
Allergies,"['watery eyes', 'sneezing fits', 'postnasal drip']","Hypersensitivity reaction to allergens"
Migraine,"['aura', 'vomiting', 'sound sensitivity']","Severe headache disorder with sensory disturbances"
Gastroenteritis,"['vomiting', 'dehydration', 'loss of appetite']","Acute inflammation of gastrointestinal tract"
Bronchitis,"['wheezing', 'shortness of breath', 'mucus production']","Lower respiratory tract inflammation"
Sinusitis,"['toothache', 'bad breath', 'green nasal discharge']","Bacterial or viral sinus infection"
Strep Throat,"['white patches on throat', 'difficulty swallowing']","Streptococcal pharyngitis infection"
UTI,"['cloudy urine', 'strong urine odor', 'pelvic pressure']","Bacterial infection in urinary system"
Pneumonia,"['chest pain', 'sweating', 'rapid breathing']","Alveolar inflammation in lungs"
Common Cold,"['hoarse voice', 'mild cough', 'low energy']","Upper respiratory viral infection"
Influenza,"['extreme fatigue', 'dry cough', 'loss of appetite']","Seasonal flu with severe exhaustion"
Allergies,"['itchy throat', 'ear congestion', 'dark circles under eyes']","Chronic allergic response"
Migraine,"['visual disturbances', 'dizziness', 'neck pain']","Neurological headache with aura symptoms"
Gastroenteritis,"['bloating', 'gas', 'low-grade fever']","Stomach and intestinal inflammation"
Bronchitis,"['chest tightness', 'slight fever', 'wheezing']","Acute bronchial tube inflammation"
Sinusitis,"['ear pressure', 'reduced smell', 'jaw pain']","Chronic sinus cavity inflammation"
Strep Throat,"['red spots on roof of mouth', 'headache']","Streptococcal bacterial infection"
UTI,"['lower back pain', 'urge to urinate', 'blood in urine']","Possible kidney involvement"
Pneumonia,"['confusion (in elderly)', 'bluish lips', 'rapid pulse']","Severe lung infection with hypoxia"
Common Cold,"['slight body aches', 'watery eyes', 'tickle in throat']","Mild upper respiratory infection"
Influenza,"['stuffy nose', 'sore throat', 'eye pain']","Viral respiratory illness with systemic effects"
Allergies,"['itchy ears', 'scratchy throat', 'recurrent sneezing']","Seasonal allergic rhinitis"
Migraine,"['one-sided pain', 'worsened by movement', 'irritability']","Unilateral headache disorder"
Gastroenteritis,"['general weakness', 'muscle aches', 'mild headache']","Viral stomach flu with dehydration risk"
Bronchitis,"['hacking cough', 'soreness behind sternum', 'chills']","Acute viral bronchitis"
Sinusitis,"['pressure behind eyes', 'tooth sensitivity', 'ear fullness']","Acute bacterial sinusitis"
Strep Throat,"['sandpaper-like rash', 'stomach pain (in children)']","Scarlet fever manifestation"
UTI,"['pelvic discomfort', 'urinating small amounts']","Cystitis or lower UTI"
Pneumonia,"['shaking chills', 'nausea', 'diarrhea (in some cases)']","Lobar pneumonia presentation"
Common Cold,"['post-nasal drip', 'mild fatigue', 'occasional cough']","Post-viral upper respiratory symptoms"
Influenza,"['sudden onset fever', 'weakness', 'loss of taste']","Acute influenza infection"
Allergies,"['itchy palate', 'clear nasal discharge', 'cough']","Perennial allergic rhinitis"
Migraine,"['prodrome symptoms', 'yawning', 'food cravings']","Premonitory phase of migraine"
Gastroenteritis,"['abdominal tenderness', 'general malaise']","Infectious gastroenteritis"
Bronchitis,"['rattling sensation in chest', 'hoarseness']","Chronic bronchitis symptoms"
Sinusitis,"['thick yellow mucus', 'cough worse at night']","Purulent sinus infection"
Strep Throat,"['swollen tonsils', 'petechiae on palate']","Acute streptococcal tonsillitis"
UTI,"['feeling of incomplete emptying', 'pressure in lower abdomen']","Urinary retention symptoms"
Pneumonia,"['rust-colored sputum', 'mental confusion']","Pneumococcal pneumonia signs"
Common Cold,"['tickle in chest', 'mild ear pressure']","Post-viral cough symptoms"
Influenza,"['pain behind eyes', 'extreme exhaustion']","Severe influenza manifestation"
Allergies,"['itchy skin', 'recurrent throat clearing']","Atopic allergic response"
Migraine,"['neck stiffness', 'difficulty concentrating']","Cervicogenic migraine features"
Gastroenteritis,"['gurgling stomach', 'decreased urine output']","Dehydrating gastroenteritis"
Bronchitis,"['painful coughing', 'sore ribs from coughing']","Persistent bronchial inflammation"
Sinusitis,"['facial tenderness', 'blocked ears']","Acute maxillary sinusitis"
Strep Throat,"['abdominal pain in children', 'sandpaper rash']","Pediatric streptococcal infection"
UTI,"['fever (if kidney infection)', 'flank pain']","Pyelonephritis symptoms"
Pneumonia,"['grunting sounds when breathing', 'nasal flaring']","Severe respiratory distress"
Common Cold,"['mild swollen glands', 'scratchy throat']","Lymph node response to infection"
Influenza,"['sweating', 'shivering alternately']","Febrile influenza symptoms"
Allergies,"['itchy roof of mouth', 'snoring from congestion']","Chronic allergic congestion"
Migraine,"['light flashes', 'zigzag patterns in vision']","Migraine with visual aura"
Gastroenteritis,"['acidic taste in mouth', 'excessive thirst']","Gastroesophageal reflux symptoms"
Bronchitis,"['breathlessness', 'tight feeling in chest']","Obstructive airway symptoms"
Sinusitis,"['mucus dripping down throat', 'ear popping']","Chronic postnasal drip"
Strep Throat,"['difficulty opening mouth', 'red swollen tonsils']","Acute tonsillar inflammation"
UTI,"['foul-smelling urine', 'pelvic heaviness']","Bacterial urinary infection"
Pneumonia,"['shallow breathing', 'loss of appetite']","Atypical pneumonia presentation"
Common Cold,"['mild hoarseness', 'occasional sneezing']","Laryngitis with cold symptoms"
Influenza,"['joint pain', 'sensitivity to light']","Systemic influenza symptoms"
Allergies,"['itchy inner ears', 'recurrent sinus infections']","Chronic allergic sinusitis"
Migraine,"['tingling in face/hands', 'slurred speech']","Complex migraine with aura"
Gastroenteritis,"['pale skin', 'sunken eyes (in severe cases)']","Severe dehydration signs"
Bronchitis,"['blue-tinged lips (in severe cases)', 'fever over 100.4F']","Hypoxic respiratory distress"
Sinusitis,"['reduced sense of taste', 'earache']","Olfactory dysfunction in sinusitis"
Strep Throat,"['tiny red spots on tongue', 'strawberry tongue']","Scarlet fever tongue signs"
UTI,"['pain during intercourse', 'general unease']","Chronic urinary symptoms"
Pneumonia,"['clammy skin', 'fatigue disproportionate to activity']","Severe systemic infection"
"""

# Create DataFrame
df = pd.read_csv(StringIO(data))

# Save to CSV
df.to_csv('medical_symptoms_100.csv', index=False)
print("Dataset with 100 rows created as 'medical_symptoms_100.csv'")

Dataset with 100 rows created as 'medical_symptoms_100.csv'


In [59]:
import pandas as pd
data=pd.read_csv('medical_symptoms_100.csv')

In [4]:
data.head()

Unnamed: 0,Disease,Symptom,Description
0,Common Cold,"['runny nose', 'sore throat', 'cough', 'sneezi...",Viral infection of upper respiratory tract
1,Influenza,"['fever', 'body aches', 'fatigue', 'headache']",Viral infection causing systemic symptoms
2,Allergies,"['sneezing', 'itchy eyes', 'nasal congestion']",Immune response to environmental triggers
3,Migraine,"['throbbing headache', 'nausea', 'light sensit...",Neurological condition with severe headaches
4,Gastroenteritis,"['diarrhea', 'nausea', 'stomach cramps']",Inflammation of stomach and intestines


# Bag of Words

In [45]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Extract required columns
disease_symptoms_map = data.groupby('Disease')['Symptom'].apply(lambda x: ' '.join(x)).to_dict()
disease_names = list(disease_symptoms_map.keys())
disease_symptoms_list = list(disease_symptoms_map.values())

# Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(disease_symptoms_list)  # Train on the entire dataset

# User input testing function
def test_bow_model():
    user_input = input("Enter symptoms: ")
    input_vector = vectorizer.transform([user_input])
    
    similarities = cosine_similarity(input_vector, X_bow)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order
    
    print("\nDiseases with similar symptoms:")
    for idx in sorted_indices[:5]:  # Show top 5 similar diseases
        if similarities[idx] > 0:
            print(f"{disease_names[idx]} (Similarity: {similarities[idx]:.4f})")
    
    most_similar_index = sorted_indices[0]
    predicted_disease = disease_names[most_similar_index]
    
    print(f"\nPredicted Disease (BoW): {predicted_disease}")
    print(f"Explanation: The input symptoms were most similar to the symptoms of {predicted_disease} based on BoW vectorization, which counts word occurrences.")
    
    # Fetch and print all symptoms associated with the predicted disease
    all_symptoms = disease_symptoms_map[predicted_disease]
    print(f"\nAll Symptoms of {predicted_disease}: {all_symptoms}")

# Test BoW model
test_bow_model()

Enter symptoms:  Pounding sensation on one side, bright lights unbearable



Diseases with similar symptoms:
Strep Throat (Similarity: 0.2357)
Migraine (Similarity: 0.0801)
Bronchitis (Similarity: 0.0671)

Predicted Disease (BoW): Strep Throat
Explanation: The input symptoms were most similar to the symptoms of Strep Throat based on BoW vectorization, which counts word occurrences.

All Symptoms of Strep Throat: ['sore throat', 'fever', 'swollen lymph nodes'] ['white patches on throat', 'difficulty swallowing'] ['red spots on roof of mouth', 'headache'] ['sandpaper-like rash', 'stomach pain (in children)'] ['swollen tonsils', 'petechiae on palate'] ['abdominal pain in children', 'sandpaper rash'] ['difficulty opening mouth', 'red swollen tonsils'] ['tiny red spots on tongue', 'strawberry tongue']


# TF-IDF

In [44]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Extract required columns
disease_symptoms_map = data.groupby('Disease')['Symptom'].apply(lambda x: ' '.join(x)).to_dict()
disease_names = list(disease_symptoms_map.keys())
disease_symptoms_list = list(disease_symptoms_map.values())

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(disease_symptoms_list)  # Train on the entire dataset

# User input testing function
def test_tfidf_model():
    user_input = input("Enter symptoms: ")
    input_vector = vectorizer.transform([user_input])
    similarities = cosine_similarity(input_vector, X_tfidf)[0]

    sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order

    print("\nDiseases with similar symptoms:")
    for idx in sorted_indices[:5]:  # Show top 5 similar diseases
        if similarities[idx] > 0:
            print(f"{disease_names[idx]} (Similarity: {similarities[idx]:.4f})")

    most_similar_index = sorted_indices[0]
    predicted_disease = disease_names[most_similar_index]

    print(f"\nPredicted Disease (TF-IDF): {predicted_disease}")
    print(f"Explanation: The input symptoms were most similar to the symptoms of {predicted_disease} based on TF-IDF vectorization, which weighs important words more.")
    
    # Fetch and print all symptoms associated with the predicted disease
    all_symptoms = disease_Asymptoms_map[predicted_disease]
    print(f"\nAll Symptoms of {predicted_disease}: {all_symptoms}")

# Test TF-IDF model
test_tfidf_model()

Enter symptoms:  Pounding sensation on one side, bright lights unbearable



Diseases with similar symptoms:
Strep Throat (Similarity: 0.2611)
Migraine (Similarity: 0.0896)
Bronchitis (Similarity: 0.0837)

Predicted Disease (TF-IDF): Strep Throat
Explanation: The input symptoms were most similar to the symptoms of Strep Throat based on TF-IDF vectorization, which weighs important words more.

All Symptoms of Strep Throat: ['sore throat', 'fever', 'swollen lymph nodes'] ['white patches on throat', 'difficulty swallowing'] ['red spots on roof of mouth', 'headache'] ['sandpaper-like rash', 'stomach pain (in children)'] ['swollen tonsils', 'petechiae on palate'] ['abdominal pain in children', 'sandpaper rash'] ['difficulty opening mouth', 'red swollen tonsils'] ['tiny red spots on tongue', 'strawberry tongue']


# Word2Vec

In [55]:
import pandas as pd
import numpy as np
import ast
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Convert string representation of lists into actual lists
data['Symptom'] = data['Symptom'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Train Word2Vec model
symptom_sentences = data['Symptom'].tolist()
word2vec_model = Word2Vec(sentences=symptom_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Function to get sentence embeddings
def get_sentence_embedding(words):
    vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(word2vec_model.vector_size)

# Group symptoms by disease
disease_symptoms_map = data.groupby('Disease')['Symptom'].apply(lambda x: sum(x.tolist(), [])).to_dict()
disease_names = list(disease_symptoms_map.keys())
disease_embeddings = [get_sentence_embedding(symptoms) for symptoms in disease_symptoms_map.values()]

# User input testing function
def test_word2vec_model():
    user_input = input("Enter symptoms (comma-separated): ")
    input_symptoms = [sym.strip() for sym in user_input.split(',')]

    input_vector = get_sentence_embedding(input_symptoms)
    similarities = cosine_similarity([input_vector], disease_embeddings)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order

    print("\nDiseases with similar symptoms:")
    for idx in sorted_indices[:5]:  # Show top 5 similar diseases
        if similarities[idx] > 0:
            print(f"{disease_names[idx]} (Similarity: {similarities[idx]:.4f})")

    most_similar_index = sorted_indices[0]
    predicted_disease = disease_names[most_similar_index]

    print(f"\nPredicted Disease (Word2Vec): {predicted_disease}")
    print(f"Explanation: The input symptoms were most similar to the symptoms of {predicted_disease} based on Word2Vec embeddings, which capture semantic relationships.")

    # Fetch and print all symptoms associated with the predicted disease
    all_symptoms = ', '.join(disease_symptoms_map[predicted_disease])
    print(f"\nAll Symptoms of {predicted_disease}: {all_symptoms}")

# Test Word2Vec model
test_word2vec_model()

Enter symptoms (comma-separated):  Pounding sensation on one side, bright lights unbearable



Diseases with similar symptoms:

Predicted Disease (Word2Vec): UTI
Explanation: The input symptoms were most similar to the symptoms of UTI based on Word2Vec embeddings, which capture semantic relationships.

All Symptoms of UTI: burning urination, frequent urination, pelvic pain, cloudy urine, strong urine odor, pelvic pressure, lower back pain, urge to urinate, blood in urine, pelvic discomfort, urinating small amounts, feeling of incomplete emptying, pressure in lower abdomen, fever (if kidney infection), flank pain, foul-smelling urine, pelvic heaviness, pain during intercourse, general unease


In [60]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Extract required columns
disease_symptoms_map = data.groupby('Disease')['Symptom'].apply(lambda x: ' '.join(x)).to_dict()
disease_names = list(disease_symptoms_map.keys())
disease_symptoms_list = list(disease_symptoms_map.values())

# Tokenize the symptoms for Word2Vec
tokenized_symptoms = [symptoms.split() for symptoms in disease_symptoms_list]

# Train a Word2Vec model on the tokenized symptoms
word2vec_model = Word2Vec(sentences=tokenized_symptoms, vector_size=100, window=5, min_count=1, workers=4)

# Function to compute the average vector for a sentence/document
def get_sentence_vector(sentence, model):
    tokens = sentence.split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Generate document vectors for all diseases using Word2Vec
X_word2vec = np.array([get_sentence_vector(symptoms, word2vec_model) for symptoms in disease_symptoms_list])

# User input testing function
def test_word2vec_model():
    user_input = input("Enter symptoms (separated by spaces): ")
    input_vector = get_sentence_vector(user_input, word2vec_model).reshape(1, -1)
    
    # Compute cosine similarities between the input and all disease vectors
    similarities = cosine_similarity(input_vector, X_word2vec)[0]

    sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order

    print("\nDiseases with similar symptoms:")
    for idx in sorted_indices[:5]:  # Show top 5 similar diseases
        if similarities[idx] > 0:
            print(f"{disease_names[idx]} (Similarity: {similarities[idx]:.4f})")

    most_similar_index = sorted_indices[0]
    predicted_disease = disease_names[most_similar_index]

    print(f"\nPredicted Disease (Word2Vec): {predicted_disease}")
    print(f"Explanation: The input symptoms were most similar to the symptoms of {predicted_disease} based on Word2Vec vectorization, which captures semantic relationships between words.")
    
    # Fetch and print all symptoms associated with the predicted disease
    all_symptoms = disease_symptoms_map[predicted_disease]
    print(f"\nAll Symptoms of {predicted_disease}: {all_symptoms}")

# Test Word2Vec model
test_word2vec_model()

Enter symptoms (separated by spaces):  Pounding sensation on one side, bright lights unbearable



Diseases with similar symptoms:
Strep Throat (Similarity: 0.5470)
Sinusitis (Similarity: 0.0869)
Influenza (Similarity: 0.0262)
Pneumonia (Similarity: 0.0131)

Predicted Disease (Word2Vec): Strep Throat
Explanation: The input symptoms were most similar to the symptoms of Strep Throat based on Word2Vec vectorization, which captures semantic relationships between words.

All Symptoms of Strep Throat: ['sore throat', 'fever', 'swollen lymph nodes'] ['white patches on throat', 'difficulty swallowing'] ['red spots on roof of mouth', 'headache'] ['sandpaper-like rash', 'stomach pain (in children)'] ['swollen tonsils', 'petechiae on palate'] ['abdominal pain in children', 'sandpaper rash'] ['difficulty opening mouth', 'red swollen tonsils'] ['tiny red spots on tongue', 'strawberry tongue']


# FastText

In [47]:
import pandas as pd
import numpy as np
import ast
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity

# Convert string representation of lists into actual lists
data['Symptom'] = data['Symptom'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Train FastText model
symptom_sentences = data['Symptom'].tolist()
fasttext_model = FastText(sentences=symptom_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Function to compute sentence embedding using FastText
def get_sentence_embedding(words):
    vectors = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(fasttext_model.vector_size)

# Group symptoms by disease
disease_symptoms_map = data.groupby('Disease')['Symptom'].apply(lambda x: sum(x.tolist(), [])).to_dict()
disease_names = list(disease_symptoms_map.keys())
disease_embeddings = np.array([get_sentence_embedding(symptoms) for symptoms in disease_symptoms_map.values()])

# User input testing function
def test_fasttext_model():
    user_input = input("Enter symptoms (comma-separated): ")
    input_symptoms = [sym.strip() for sym in user_input.split(',')]

    input_vector = get_sentence_embedding(input_symptoms)
    similarities = cosine_similarity([input_vector], disease_embeddings)[0]
    sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order

    print("\nDiseases with similar symptoms:")
    for idx in sorted_indices[:5]:  # Show top 5 similar diseases
        if similarities[idx] > 0:
            print(f"{disease_names[idx]} (Similarity: {similarities[idx]:.4f})")

    most_similar_index = sorted_indices[0]
    predicted_disease = disease_names[most_similar_index]

    print(f"\nPredicted Disease (FastText): {predicted_disease}")
    print(f"Explanation: The input symptoms were most similar to the symptoms of {predicted_disease} based on FastText embeddings, which capture subword relationships.")

    # Fetch and print all symptoms associated with the predicted disease
    all_symptoms = ', '.join(disease_symptoms_map[predicted_disease])
    print(f"\nAll Symptoms of {predicted_disease}: {all_symptoms}")

# Test FastText model
test_fasttext_model()

Enter symptoms (comma-separated):  Pounding sensation on one side, bright lights unbearable



Diseases with similar symptoms:
Common Cold (Similarity: 0.1057)
Migraine (Similarity: 0.1056)
Bronchitis (Similarity: 0.0754)
Strep Throat (Similarity: 0.0538)
Influenza (Similarity: 0.0517)

Predicted Disease (FastText): Common Cold
Explanation: The input symptoms were most similar to the symptoms of Common Cold based on FastText embeddings, which capture subword relationships.

All Symptoms of Common Cold: runny nose, sore throat, cough, sneezing, congestion, mild fever, sore throat, hoarse voice, mild cough, low energy, slight body aches, watery eyes, tickle in throat, post-nasal drip, mild fatigue, occasional cough, tickle in chest, mild ear pressure, mild swollen glands, scratchy throat, mild hoarseness, occasional sneezing


# BERT

In [15]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# 1. System Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 2. Load Data
def load_data():
    df = pd.read_csv('medical_symptoms_100.csv')
    df['Symptoms'] = df['Symptom'].apply(lambda x: eval(x) if isinstance(x, str) else [])
    df = df[df['Symptoms'].apply(len) > 0].reset_index(drop=True)
    df['text'] = df['Symptoms'].apply(lambda x: ', '.join(x))
    return df

df = load_data()
diseases = df['Disease'].unique().tolist()

# 3. Initialize BERT for Zero-Shot Classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')    #base-size BERT (12 layers)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()    #Turns off dropout layers and other training-specific behaviors.

# 4. Zero-Shot Prediction Function
def predict_zero_shot(text, candidate_labels):
    # Tokenize all candidate labels
    label_inputs = tokenizer(candidate_labels, padding=True, truncation=True, return_tensors='pt')
    label_inputs = {k: v.to(device) for k, v in label_inputs.items()}
    
    # Tokenize input text
    text_inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=64).to(device)
    
    with torch.no_grad():
        # Get embeddings for text
        text_outputs = model.bert(**text_inputs)    # Shape: [batch_size, sequence_length, hidden_size]
        text_embedding = text_outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        
        # Get embeddings for labels
        label_outputs = model.bert(**label_inputs)    
        label_embeddings = label_outputs.last_hidden_state.mean(dim=1)
        
        # Calculate cosine similarity
        similarities = torch.nn.functional.cosine_similarity(
            text_embedding.unsqueeze(1),
            label_embeddings.unsqueeze(0),
            dim=2
        )
        
        # Get most similar label
        pred_idx = torch.argmax(similarities).item()
        confidence = torch.softmax(similarities, dim=1)[0][pred_idx].item()
    
    return candidate_labels[pred_idx], confidence

# 5. Evaluation
true_labels = []
pred_labels = []
confidences = []

print("Running zero-shot predictions...")
for text, label in tqdm(zip(df['text'], df['Disease']), total=len(df)):
    pred, confidence = predict_zero_shot(text, diseases)
    true_labels.append(label)
    pred_labels.append(pred)
    confidences.append(confidence)

# 6. Performance Metrics
print("\nZero-Shot BERT Results:")
print(f"Accuracy: {accuracy_score(true_labels, pred_labels):.2%}")

Using device: cpu


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running zero-shot predictions...


100%|██████████| 80/80 [00:17<00:00,  4.59it/s]


Zero-Shot BERT Results:
Accuracy: 12.50%





In [51]:
# 7. Sample Predictions
print("\nSample Predictions:")
test_cases = [
    ['headache', 'fever'],
    ['cough', 'shortness', 'breath'],
    ['sore throat', 'fever'],
    ['pounding head pain', 'nausea', 'vomiting'],
    ['Pounding sensation on one side', 'bright lights unbearable']
]

for symptoms in test_cases:
    text = ', '.join(symptoms)
    pred, confidence = predict_zero_shot(text, diseases)
    print(f"\nSymptoms: {text}")
    print(f"Predicted: {pred} ({confidence:.2%} confidence)")


Sample Predictions:

Symptoms: headache, fever
Predicted: Migraine (11.28% confidence)

Symptoms: cough, shortness, breath
Predicted: Strep Throat (11.14% confidence)

Symptoms: sore throat, fever
Predicted: Sinusitis (11.14% confidence)

Symptoms: pounding head pain, nausea, vomiting
Predicted: Sinusitis (11.27% confidence)

Symptoms: Pounding sensation on one side, bright lights unbearable
Predicted: Migraine (10.92% confidence)


# Fine-tuned BERT

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import psutil
import gc

# 1. System Check
print(f"Available RAM: {psutil.virtual_memory().available / (1024**3):.1f} GB")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 2. Robust Data Loading
def load_data():
    try:
        df = pd.read_csv('medical_symptoms_100.csv')
        # Ensure we have required columns
        if not all(col in df.columns for col in ['Disease', 'Symptom']):
            raise ValueError("Missing required columns")
        
        # Convert symptoms safely
        def safe_convert(x):
            try:
                return eval(x) if isinstance(x, str) else []
            except:
                return []
        
        df['Symptoms'] = df['Symptom'].apply(safe_convert)
        df = df[df['Symptoms'].apply(len) > 0].reset_index(drop=True)
        df['text'] = df['Symptoms'].apply(lambda x: ' '.join(x))
        return df
    except Exception as e:
        print(f"Data loading failed: {str(e)}")
        exit()

df = load_data()

# 3. Label Encoding
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Disease'])

# 4. Train-Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label'])

# 5. Tokenization with Proper Alignment
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_with_labels(texts, labels, batch_size=4, max_length=64):
    input_ids = []
    attention_masks = []    #Tell BERT which tokens are real (1) and which are padding (0).
    final_labels = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        
        encodings = tokenizer(
            batch_texts.tolist(),
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        
        input_ids.append(encodings['input_ids'])
        attention_masks.append(encodings['attention_mask'])
        final_labels.extend(batch_labels)
        
        torch.cuda.empty_cache() if torch.cuda.is_available() else gc.collect()
    
    return {
        'input_ids': torch.cat(input_ids),
        'attention_mask': torch.cat(attention_masks),
        'labels': torch.tensor(final_labels)
    }

print("Tokenizing training data...")
train_encodings = tokenize_with_labels(train_texts, train_labels)
print("Tokenizing test data...")
test_encodings = tokenize_with_labels(test_texts, test_labels)

# 6. Fixed Dataset Class
class SymptomsDataset(Dataset):
    def __init__(self, encodings):
        self.input_ids = encodings['input_ids']
        self.attention_mask = encodings['attention_mask']
        self.labels = encodings['labels']
        
        # Validate lengths
        assert len(self.input_ids) == len(self.attention_mask) == len(self.labels), "Length mismatch in dataset"
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }
    
    def __len__(self):
        return len(self.labels)

train_dataset = SymptomsDataset(train_encodings)
test_dataset = SymptomsDataset(test_encodings)

# 7. Updated Model Definition
class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(768, num_classes)
        
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        pooled_output = outputs[1]  # Using pooled output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Initialize model
model = BertClassifier(num_classes=len(label_encoder.classes_)).to(device)

# 8. Training Setup
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)

optimizer = AdamW(model.parameters(), lr=3e-5)
loss_fn = nn.CrossEntropyLoss()

# 9. Training Loop
def train_epoch(model, dataloader, loss_fn, optimizer):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device)
        }
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        progress_bar.set_postfix({
            'loss': total_loss/total,
            'acc': correct/total
        })
        
        torch.cuda.empty_cache() if torch.cuda.is_available() else gc.collect()
    
    return total_loss/len(dataloader), correct/total

def evaluate(model, dataloader, loss_fn):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)
            }
            labels = batch['labels'].to(device)
            
            outputs = model(**inputs)
            loss = loss_fn(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            torch.cuda.empty_cache() if torch.cuda.is_available() else gc.collect()
    
    return total_loss/len(dataloader), correct/total

# 10. Main Training
best_accuracy = 0
for epoch in range(7):
    print(f"\nEpoch {epoch+1}")
    train_loss, train_acc = train_epoch(model, train_loader, loss_fn, optimizer)
    val_loss, val_acc = evaluate(model, test_loader, loss_fn)
    
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2%}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2%}")
    
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model.pt')
        print("Saved new best model")

# 11. Prediction Function
def predict(symptoms):
    text = ' '.join(symptoms)
    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors='pt'
    ).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs, dim=1)[0]
        top3 = torch.topk(probs, 3)
    
    return [
        (label_encoder.inverse_transform([idx.item()])[0], f"{prob.item():.2%}")
        for idx, prob in zip(top3.indices, top3.values)
    ]

Available RAM: 28.7 GB
Using device: cpu
Tokenizing training data...
Tokenizing test data...

Epoch 1


                                                                               

Train Loss: 2.3466 | Train Acc: 17.19%
Val Loss: 2.3027 | Val Acc: 12.50%
Saved new best model

Epoch 2


                                                                                

Train Loss: 2.3182 | Train Acc: 9.38%
Val Loss: 2.3112 | Val Acc: 6.25%

Epoch 3


                                                                               

Train Loss: 2.3197 | Train Acc: 17.19%
Val Loss: 2.2720 | Val Acc: 6.25%

Epoch 4


                                                                               

Train Loss: 2.2602 | Train Acc: 17.19%
Val Loss: 2.2506 | Val Acc: 12.50%

Epoch 5


                                                                                

Train Loss: 1.8491 | Train Acc: 39.06%
Val Loss: 2.1335 | Val Acc: 18.75%
Saved new best model

Epoch 6


                                                                                

Train Loss: 1.4552 | Train Acc: 71.88%
Val Loss: 2.1323 | Val Acc: 37.50%
Saved new best model

Epoch 7


                                                                                

Train Loss: 1.0612 | Train Acc: 90.62%
Val Loss: 1.9604 | Val Acc: 37.50%




Model:
[Tokenized Text]->[Pretrained BERT Encoder]->[Pooled [CLS] Token Output]->[Dropout]->[Linear Classification Layer]->[Logits for Each Class]

In [53]:
# Test predictions
print("\nSample Predictions:")
test_cases = [
    ['headache', 'fever'],
    ['cough', 'shortness', 'breath'],
    ['sore throat', 'fever'],
    ['pounding head pain', 'nausea', 'vomiting'],
    ['Pounding sensation on one side', 'bright lights unbearable']
]

for symptoms in test_cases:
    predictions = predict(symptoms)
    print(f"\nSymptoms: {', '.join(symptoms)}")
    for i, (disease, confidence) in enumerate(predictions, 1):
        print(f"{i}. {disease} ({confidence} confidence)")


Sample Predictions:

Symptoms: headache, fever
1. Migraine (43.85% confidence)
2. Influenza (16.06% confidence)
3. Bronchitis (11.12% confidence)

Symptoms: cough, shortness, breath
1. Bronchitis (29.55% confidence)
2. Common Cold (23.47% confidence)
3. Migraine (10.07% confidence)

Symptoms: sore throat, fever
1. Allergies (24.94% confidence)
2. Common Cold (20.27% confidence)
3. Bronchitis (20.10% confidence)

Symptoms: pounding head pain, nausea, vomiting
1. Migraine (35.78% confidence)
2. Bronchitis (12.10% confidence)
3. Common Cold (11.97% confidence)

Symptoms: Pounding sensation on one side, bright lights unbearable
1. Migraine (37.23% confidence)
2. Bronchitis (11.26% confidence)
3. Pneumonia (10.39% confidence)


In [22]:
# 1. Simple overview
print("\n" + "="*50)
print("Model Architecture Overview:")
print(model)
print("="*50 + "\n")

# 2. Detailed summary with torchinfo
try:
    from torchinfo import summary
    print("Detailed Model Summary:")
    summary(model, 
            input_size=[(2, 64), (2, 64)],  # (batch, seq_len) for input_ids and attention_mask
            dtypes=[torch.long, torch.long],
            col_names=["input_size", "output_size", "num_params", "trainable"],
            verbose=1)
except ImportError:
    print("Install torchinfo for detailed summary: pip install torchinfo")
    
    # 3. Fallback to parameter count
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nTotal Parameters: {total_params:,}")
    print(f"Trainable Parameters: {trainable_params:,}")


Model Architecture Overview:
BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [23]:
# Test a single batch
batch = next(iter(train_loader))
outputs = model(
    input_ids=batch['input_ids'].to(device),
    attention_mask=batch['attention_mask'].to(device)
)
print(f"Output shape: {outputs.shape}")  # Should be [batch_size, num_classes]

Output shape: torch.Size([2, 10])


In [24]:
probs = torch.softmax(outputs, dim=1)
print(f"Probabilities sum to 1: {torch.allclose(probs.sum(dim=1), torch.ones(probs.shape[0]).to(device))}")

Probabilities sum to 1: True


In [2]:
import pandas as pd
from io import StringIO

# 100-row medical symptom dataset
data = """
Disease,Symptom,Description
Common Cold,"['runny nose', 'sore throat', 'cough', 'sneezing']","Viral infection of upper respiratory tract"
Influenza,"['fever', 'body aches', 'fatigue', 'headache']","Viral infection causing systemic symptoms"
Allergies,"['sneezing', 'itchy eyes', 'nasal congestion']","Immune response to environmental triggers"
Migraine,"['throbbing headache', 'nausea', 'light sensitivity']","Neurological condition with severe headaches"
Gastroenteritis,"['diarrhea', 'nausea', 'stomach cramps']","Inflammation of stomach and intestines"
Bronchitis,"['persistent cough', 'chest congestion', 'fatigue']","Inflammation of bronchial tubes"
Sinusitis,"['facial pain', 'nasal congestion', 'headache']","Inflammation of sinus cavities"
Strep Throat,"['sore throat', 'fever', 'swollen lymph nodes']","Bacterial infection of the throat"
UTI,"['burning urination', 'frequent urination', 'pelvic pain']","Urinary tract infection"
Pneumonia,"['cough', 'fever', 'shortness of breath']","Lung infection causing inflammation"
Common Cold,"['congestion', 'mild fever', 'sore throat']","Mild viral infection affecting nose and throat"
Influenza,"['high fever', 'chills', 'muscle pain']","Acute respiratory illness with systemic effects"
Allergies,"['watery eyes', 'sneezing fits', 'postnasal drip']","Hypersensitivity reaction to allergens"
Migraine,"['aura', 'vomiting', 'sound sensitivity']","Severe headache disorder with sensory disturbances"
Gastroenteritis,"['vomiting', 'dehydration', 'loss of appetite']","Acute inflammation of gastrointestinal tract"
Bronchitis,"['wheezing', 'shortness of breath', 'mucus production']","Lower respiratory tract inflammation"
Sinusitis,"['toothache', 'bad breath', 'green nasal discharge']","Bacterial or viral sinus infection"
Strep Throat,"['white patches on throat', 'difficulty swallowing']","Streptococcal pharyngitis infection"
UTI,"['cloudy urine', 'strong urine odor', 'pelvic pressure']","Bacterial infection in urinary system"
Pneumonia,"['chest pain', 'sweating', 'rapid breathing']","Alveolar inflammation in lungs"
Common Cold,"['hoarse voice', 'mild cough', 'low energy']","Upper respiratory viral infection"
Influenza,"['extreme fatigue', 'dry cough', 'loss of appetite']","Seasonal flu with severe exhaustion"
Allergies,"['itchy throat', 'ear congestion', 'dark circles under eyes']","Chronic allergic response"
Migraine,"['visual disturbances', 'dizziness', 'neck pain']","Neurological headache with aura symptoms"
Gastroenteritis,"['bloating', 'gas', 'low-grade fever']","Stomach and intestinal inflammation"
Bronchitis,"['chest tightness', 'slight fever', 'wheezing']","Acute bronchial tube inflammation"
Sinusitis,"['ear pressure', 'reduced smell', 'jaw pain']","Chronic sinus cavity inflammation"
Strep Throat,"['red spots on roof of mouth', 'headache']","Streptococcal bacterial infection"
UTI,"['lower back pain', 'urge to urinate', 'blood in urine']","Possible kidney involvement"
Pneumonia,"['confusion (in elderly)', 'bluish lips', 'rapid pulse']","Severe lung infection with hypoxia"
Common Cold,"['slight body aches', 'watery eyes', 'tickle in throat']","Mild upper respiratory infection"
Influenza,"['stuffy nose', 'sore throat', 'eye pain']","Viral respiratory illness with systemic effects"
Allergies,"['itchy ears', 'scratchy throat', 'recurrent sneezing']","Seasonal allergic rhinitis"
Migraine,"['one-sided pain', 'worsened by movement', 'irritability']","Unilateral headache disorder"
Gastroenteritis,"['general weakness', 'muscle aches', 'mild headache']","Viral stomach flu with dehydration risk"
Bronchitis,"['hacking cough', 'soreness behind sternum', 'chills']","Acute viral bronchitis"
Sinusitis,"['pressure behind eyes', 'tooth sensitivity', 'ear fullness']","Acute bacterial sinusitis"
Strep Throat,"['sandpaper-like rash', 'stomach pain (in children)']","Scarlet fever manifestation"
UTI,"['pelvic discomfort', 'urinating small amounts']","Cystitis or lower UTI"
Pneumonia,"['shaking chills', 'nausea', 'diarrhea (in some cases)']","Lobar pneumonia presentation"
Common Cold,"['post-nasal drip', 'mild fatigue', 'occasional cough']","Post-viral upper respiratory symptoms"
Influenza,"['sudden onset fever', 'weakness', 'loss of taste']","Acute influenza infection"
Allergies,"['itchy palate', 'clear nasal discharge', 'cough']","Perennial allergic rhinitis"
Migraine,"['prodrome symptoms', 'yawning', 'food cravings']","Premonitory phase of migraine"
Gastroenteritis,"['abdominal tenderness', 'general malaise']","Infectious gastroenteritis"
Bronchitis,"['rattling sensation in chest', 'hoarseness']","Chronic bronchitis symptoms"
Sinusitis,"['thick yellow mucus', 'cough worse at night']","Purulent sinus infection"
Strep Throat,"['swollen tonsils', 'petechiae on palate']","Acute streptococcal tonsillitis"
UTI,"['feeling of incomplete emptying', 'pressure in lower abdomen']","Urinary retention symptoms"
Pneumonia,"['rust-colored sputum', 'mental confusion']","Pneumococcal pneumonia signs"
Common Cold,"['tickle in chest', 'mild ear pressure']","Post-viral cough symptoms"
Influenza,"['pain behind eyes', 'extreme exhaustion']","Severe influenza manifestation"
Allergies,"['itchy skin', 'recurrent throat clearing']","Atopic allergic response"
Migraine,"['neck stiffness', 'difficulty concentrating']","Cervicogenic migraine features"
Gastroenteritis,"['gurgling stomach', 'decreased urine output']","Dehydrating gastroenteritis"
Bronchitis,"['painful coughing', 'sore ribs from coughing']","Persistent bronchial inflammation"
Sinusitis,"['facial tenderness', 'blocked ears']","Acute maxillary sinusitis"
Strep Throat,"['abdominal pain in children', 'sandpaper rash']","Pediatric streptococcal infection"
UTI,"['fever (if kidney infection)', 'flank pain']","Pyelonephritis symptoms"
Pneumonia,"['grunting sounds when breathing', 'nasal flaring']","Severe respiratory distress"
Common Cold,"['mild swollen glands', 'scratchy throat']","Lymph node response to infection"
Influenza,"['sweating', 'shivering alternately']","Febrile influenza symptoms"
Allergies,"['itchy roof of mouth', 'snoring from congestion']","Chronic allergic congestion"
Migraine,"['light flashes', 'zigzag patterns in vision']","Migraine with visual aura"
Gastroenteritis,"['acidic taste in mouth', 'excessive thirst']","Gastroesophageal reflux symptoms"
Bronchitis,"['breathlessness', 'tight feeling in chest']","Obstructive airway symptoms"
Sinusitis,"['mucus dripping down throat', 'ear popping']","Chronic postnasal drip"
Strep Throat,"['difficulty opening mouth', 'red swollen tonsils']","Acute tonsillar inflammation"
UTI,"['foul-smelling urine', 'pelvic heaviness']","Bacterial urinary infection"
Pneumonia,"['shallow breathing', 'loss of appetite']","Atypical pneumonia presentation"
Common Cold,"['mild hoarseness', 'occasional sneezing']","Laryngitis with cold symptoms"
Influenza,"['joint pain', 'sensitivity to light']","Systemic influenza symptoms"
Allergies,"['itchy inner ears', 'recurrent sinus infections']","Chronic allergic sinusitis"
Migraine,"['tingling in face/hands', 'slurred speech']","Complex migraine with aura"
Gastroenteritis,"['pale skin', 'sunken eyes (in severe cases)']","Severe dehydration signs"
Bronchitis,"['blue-tinged lips (in severe cases)', 'fever over 100.4F']","Hypoxic respiratory distress"
Sinusitis,"['reduced sense of taste', 'earache']","Olfactory dysfunction in sinusitis"
Strep Throat,"['tiny red spots on tongue', 'strawberry tongue']","Scarlet fever tongue signs"
UTI,"['pain during intercourse', 'general unease']","Chronic urinary symptoms"
Pneumonia,"['clammy skin', 'fatigue disproportionate to activity']","Severe systemic infection"
"""

# Create DataFrame
df = pd.read_csv(StringIO(data))

# Save to CSV
df.to_csv('medical_symptoms_new_100.csv', index=False)
print("Dataset with 100 rows and complete descriptions created as 'medical_symptoms_new_100.csv'")

Dataset with 100 rows and complete descriptions created as 'medical_symptoms_new_100.csv'
