# In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load and clean data
df = pd.read_csv("dataset.csv")
# Normalise every cell to a stripped string. Missing values become the string
# 'nan' after astype(str); those are replaced with '' so they can be treated
# as "no symptom" below.
for col in df.columns:
    df[col] = df[col].astype(str).str.strip().replace('nan', '')

# Create binary features
# Every column after the first is treated as a symptom slot.
symptom_cells = df.iloc[:, 1:]
# Sorted vocabulary of distinct symptom names, with the blank placeholder removed.
all_symptoms = sorted(set(symptom_cells.values.flatten()) - {''})

# One 0/1 indicator column per symptom: 1 when the symptom appears anywhere in
# the row. Built column-wise instead of cell-by-cell with X.loc[i, symptom],
# which is far slower and relied on the index labels being 0..n-1.
X = pd.DataFrame(
    {s: symptom_cells.eq(s).any(axis=1).astype('int64') for s in all_symptoms},
    index=df.index,
    columns=all_symptoms,
)

# Encode target
# Fit the encoder on the disease names, then map every row to its integer class id.
label_encoder = LabelEncoder().fit(df['Disease'])
y = label_encoder.transform(df['Disease'])

# Split data
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train model
# NOTE(review): the tree is capped at depth 8; if the reported accuracy looks
# low, this cap is the first thing to revisit -- confirm against the number of
# distinct disease classes.
model = DecisionTreeClassifier(max_depth=8, random_state=42).fit(X_train, y_train)

# Evaluate
# Score the held-out rows and report plain accuracy to four decimals.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Simple prediction function
def predict(symptoms):
    """Predict the disease name for a collection of symptom names.

    Args:
        symptoms: An iterable of symptom names (for example a list), or a
            single symptom name given as a string. Surrounding whitespace is
            stripped from each name. Names that do not appear in
            ``all_symptoms`` are ignored.

    Returns:
        The disease label that ``label_encoder`` decodes from the class
        predicted by ``model``.
    """
    # A bare string would otherwise be tested by substring
    # ('rash' in 'skin_rash' is True), switching on unrelated features.
    if isinstance(symptoms, str):
        symptoms = [symptoms]

    # Set membership is exact and O(1); non-string entries can never match a
    # symptom name, so they are dropped here.
    present = {s.strip() for s in symptoms if isinstance(s, str)}

    # Pass a single-row frame carrying the same column names the model was
    # fitted with, so scikit-learn can check feature alignment instead of
    # warning about missing feature names.
    features = pd.DataFrame(
        [[1 if s in present else 0 for s in all_symptoms]],
        columns=all_symptoms,
    )
    pred = model.predict(features)[0]
    return label_encoder.inverse_transform([pred])[0]

# Smoke-test the prediction helper on a two-symptom example.
sample_symptoms = ['itching', 'skin_rash']
print(f"Test: {predict(sample_symptoms)}")

# Output:
# Accuracy: 0.1799
# Test: Chronic cholestasis


