# ENT Symptom Checker using Random Forest

This notebook trains a machine learning model to predict ENT diseases based on patient symptoms using TF-IDF and Random Forest.

## Step 1: Load and Preprocess Dataset
We load the dataset and consolidate symptom-related columns into a single text field.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

# Read the ENT symptom records from the CSV export.
df = pd.read_csv("ENT symptoms dataset - no_dups_ent.csv")

# Columns whose text is combined into the model's input.
symptom_cols = ['Common Symptoms', 'Patient Reported Symptoms', 'Additional Symptoms']

# Replace missing entries with empty strings so every cell can be joined.
df[symptom_cols] = df[symptom_cols].fillna('')
# Concatenate each row's symptom columns, space-separated, into one text field.
df['All Symptoms'] = df[symptom_cols].apply(lambda row: ' '.join(row), axis=1)

# Drop diseases with fewer than 10 records. The explicit .copy() makes
# df_filtered an independent frame, so adding the 'label' column below does
# not raise SettingWithCopyWarning or depend on pandas' view-vs-copy rules.
df_filtered = (
    df.groupby('Disease Name')
    .filter(lambda group: len(group) >= 10)
    .copy()
)

# Show how many records remain for each disease.
print(df_filtered['Disease Name'].value_counts())

# Encode disease names as integer class ids, stored next to the names.
label_encoder = LabelEncoder()
df_filtered['label'] = label_encoder.fit_transform(df_filtered['Disease Name'])

# Target: the disease name of every retained record.
y = df_filtered['Disease Name']

# Train-test split (80/20, stratified by disease). The split is performed on
# the raw text BEFORE vectorizing so that the TF-IDF vocabulary and IDF
# weights are learned from training documents only; fitting the vectorizer on
# all rows would leak test-set statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df_filtered['All Symptoms'], y, test_size=0.2, stratify=y, random_state=42
)

# Fit TF-IDF on the training text, then apply the same mapping to the test text.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Feature matrix for all retained rows in the train-fitted feature space,
# kept so that any code referring to X continues to work.
X = vectorizer.transform(df_filtered['All Symptoms'])

# Balance the training data by upsampling every class to the size of the
# largest one. The sparse TF-IDF matrix is densified so that features and
# labels can be resampled together in a single DataFrame.
df_train = pd.DataFrame(X_train.toarray())
df_train['Disease Name'] = y_train.values

# Size of the largest class; computed once instead of once per class.
target_size = df_train['Disease Name'].value_counts().max()

# Draw target_size rows (with replacement) from each class. A single groupby
# pass replaces one boolean scan per class; sort=False keeps the classes in
# order of first appearance, matching iteration over .unique().
df_balanced = pd.concat([
    resample(class_rows, replace=True, n_samples=target_size, random_state=42)
    for _, class_rows in df_train.groupby('Disease Name', sort=False)
])

# Split back into features and labels.
X_train_balanced = df_balanced.drop(columns='Disease Name')
y_train_balanced = df_balanced['Disease Name']

# Fit a 100-tree random forest (fixed seed) on the balanced training set.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_balanced, y_train_balanced)

# Predicted disease for every held-out sample.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test set.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

# Unweighted mean of the per-class F1 scores.
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("Macro F1-score:", macro_f1)


## Step 2: Filter Diseases with Fewer Than 10 Records

## Step 3: Encode Labels and Vectorize Text using TF-IDF

## Step 4: Train-Test Split and Data Balancing

## Step 5: Train Random Forest Classifier

## Step 6: Evaluate Model Performance