In [1]:
# -*- coding: utf-8 -*-
"""SympTriage_KNN.ipynb"""

!pip install kaggle
!pip install librosa
!pip install torch
!pip install scikit-learn
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

import kagglehub

# Download latest version
path = kagglehub.dataset_download("itachi9604/disease-symptom-description-dataset")

print("Path to dataset files:", path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, precision_score, roc_curve
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import warnings
import torch
import os
import pickle

warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

DATASET_PATH = path

class_dir = os.path.join(DATASET_PATH)
for filename in os.listdir(class_dir):
    print(filename)

# **Read and shuffle the dataset**
df = pd.read_csv(os.path.join(DATASET_PATH, 'dataset.csv'))
df = shuffle(df, random_state=42)

for col in df.columns:
    df[col] = df[col].str.replace('_', ' ')

null_checker = df.apply(lambda x: sum(x.isnull())).to_frame(name='count')

cols = df.columns
data = df[cols].values.flatten()
s = pd.Series(data)
s = s.str.strip()
s = s.values.reshape(df.shape)

df = pd.DataFrame(s, columns=df.columns)
df = df.fillna(0)

df1 = pd.read_csv(os.path.join(DATASET_PATH, 'Symptom-severity.csv'))
x = df1['Symptom']

dfx = pd.DataFrame()
dfx["Disease"] = df["Disease"]
dfx[x] = 0

for index, row in df.iterrows():
    for symptom in df.columns[1:]:
        if row[symptom] != 0:
            dfx.loc[index, row[symptom]] = 1

dfx = dfx.fillna(0)
dfx[dfx.columns[1:]] = dfx[dfx.columns[1:]].astype('int')
dfx.columns = dfx.columns.str.strip()

symptom_sums = dfx.iloc[:, 1:].sum(axis=0)
symptoms_with_no_values = symptom_sums[symptom_sums == 0].index.tolist()

columns_to_drop = symptoms_with_no_values
dfx = dfx.drop(columns=columns_to_drop)
dfx[dfx.columns[1:]].sum(axis=0).sort_values()

y = df['Disease'].unique()

data = dfx.iloc[:, 1:].values
labels = dfx['Disease'].values

x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=0.8, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Encode the target variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

y = le.classes_
y

# KNN model training
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=kfold, scoring='f1_weighted', verbose=1)
grid_search.fit(x_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
model = grid_search.best_estimator_

cv_results = pd.DataFrame(grid_search.cv_results_)

# Model evaluation
test_predictions = model.predict(x_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")
test_f1 = f1_score(y_test, test_predictions, average='weighted')
print(f'KNN test F1 Score: {test_f1:.4f}')

with open('knn_model.pkl', 'wb') as f:
    pickle.dump(model, f)
print("Best model saved as 'knn_model.pkl'")

with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(le, le_file)

# Function for predicting diseases
def preprocess_and_predict(user_input, model_file, label_encoder_file, symptom_columns):
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

    with open(label_encoder_file, 'rb') as le_file:
        le = pickle.load(le_file)

    user_df = pd.DataFrame([user_input], columns=symptom_columns)
    user_df.fillna(0, inplace=True)
    input_data = user_df.values

    prediction = model.predict(input_data)
    probabilities = model.predict_proba(input_data)

    predicted_label = le.inverse_transform(prediction)
    prob_dict = {le.inverse_transform([i])[0]: prob for i, prob in enumerate(probabilities[0])}

    return predicted_label[0], prob_dict

# Symptom columns
symptom_columns = ['itching', 'shivering', 'chills', 'acidity', 'vomiting', 'fatigue', 'anxiety', 'restlessness', 'lethargy', 'cough', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'nausea', 'constipation', 'diarrhoea', 'malaise', 'phlegm', 'congestion', 'dizziness', 'cramps', 'bruising', 'obesity', 'unsteadiness', 'depression', 'irritability', 'polyuria', 'coma', 'palpitations', 'blackheads', 'scurring', 'blister', 'skin rash', 'pus filled pimples', 'mood swings', 'weight loss', 'fast heart rate', 'excessive hunger', 'muscle weakness', 'abnormal menstruation', 'muscle wasting', 'patches in throat', 'high fever', 'extra marital contacts', 'yellowish skin', 'loss of appetite', 'abdominal pain', 'yellowing of eyes', 'chest pain', 'loss of balance', 'lack of concentration', 'blurred and distorted vision', 'drying and tingling lips', 'slurred speech', 'stiff neck', 'swelling joints', 'painful walking', 'dark urine', 'yellow urine', 'receiving blood transfusion', 'receiving unsterile injections', 'visual disturbances', 'burning micturition', 'bladder discomfort', 'foul smell of urine', 'continuous feel of urine', 'irregular sugar level', 'increased appetite', 'joint pain', 'skin peeling', 'small dents in nails', 'inflammatory nails', 'swelling of stomach', 'distention of abdomen', 'history of alcohol consumption', 'fluid overload', 'pain during bowel movements', 'pain in anal region', 'bloody stool', 'irritation in anus', 'acute liver failure', 'stomach bleeding', 'back pain', 'weakness in limbs', 'neck pain', 'mucoid sputum', 'mild fever', 'muscle pain', 'family history', 'continuous sneezing', 'watering from eyes', 'rusty sputum', 'weight gain', 'puffy face and eyes', 'enlarged thyroid', 'brittle nails', 'swollen extremeties', 'swollen legs', 'prominent veins on calf', 'stomach pain', 'spinning movements', 'sunken eyes', 'silver like dusting', 'swelled lymph nodes', 'blood in sputum', 'swollen blood vessels', 'toxic look (typhos)', 'belly pain', 'throat irritation', 'redness of eyes', 'sinus pressure', 'runny nose', 'loss of smell', 'passage of gases', 'cold hands and feets', 'weakness of one body side', 'altered sensorium', 'nodal skin eruptions', 'red sore around nose', 'yellow crust ooze', 'ulcers on tongue', 'spotting  urination', 'pain behind the eyes', 'red spots over body', 'internal itching', 'movement stiffness', 'knee pain', 'hip joint pain', 'dischromic  patches']

# Example user input
user_input = {
    'itching': 1,
    'skin_rash': 1,
    'nodal_skin_eruptions': 0,
    'continuous_sneezing': 1,
    'shivering': 1,
    'chills': 1,
    'joint_pain': 1,
    'stomach_pain': 1,
}

# Predict the disease
predicted_disease, probability = preprocess_and_predict(user_input, 'knn_model.pkl', 'label_encoder.pkl', symptom_columns)
print(f"Predicted Disease: {predicted_disease}")
print(f"Probability : {probability}")


cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Downloading from https://www.kaggle.com/api/v1/datasets/download/itachi9604/disease-symptom-description-dataset?dataset_version_number=2...


100%|██████████| 30.1k/30.1k [00:00<00:00, 34.8MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/itachi9604/disease-symptom-description-dataset/versions/2





Using device: cpu
dataset.csv
symptom_precaution.csv
Symptom-severity.csv
symptom_Description.csv
(3936, 131) (984, 131) (3936,) (984,)
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Test Accuracy: 1.0000
KNN test F1 Score: 1.0000
Best model saved as 'knn_model.pkl'
Predicted Disease: Allergy
Probability : {'(vertigo) Paroymsal  Positional Vertigo': 0.0, 'AIDS': 0.0, 'Acne': 0.0, 'Alcoholic hepatitis': 0.0, 'Allergy': 1.0, 'Arthritis': 0.0, 'Bronchial Asthma': 0.0, 'Cervical spondylosis': 0.0, 'Chicken pox': 0.0, 'Chronic cholestasis': 0.0, 'Common Cold': 0.0, 'Dengue': 0.0, 'Diabetes': 0.0, 'Dimorphic hemmorhoids(piles)': 0.0, 'Drug Reaction': 0.0, 'Fungal infection': 0.0, 'GERD': 0.0, 'Gastroenteritis': 0.0, 'Heart attack': 0.0, 'Hepatitis B': 0.0, 'Hepatitis C': 0.0, 'Hepatitis D': 0.0, 'Hepatitis E': 0.0, 'Hypertension': 0.0, 'Hyperthyroidism': 0.0, 'Hypoglycemia': 0.0, 'Hypothyroidism': 

In [3]:
user_input = {
    'chest_pain': 1,
    'phlegm': 1,
    'runny_nose': 1,
    'high_fever': 1,
    'throat_irritation': 1,
    'congestion': 1,
    'redness_of_eyes': 1,
}


# Predict the disease
predicted_disease, probability = preprocess_and_predict(user_input, 'knn_model.pkl', 'label_encoder.pkl', symptom_columns)
print(f"Predicted Disease: {predicted_disease}")
print(f"Probability : {probability}")

Predicted Disease: Fungal infection
Probability : {'(vertigo) Paroymsal  Positional Vertigo': 0.0, 'AIDS': 0.0, 'Acne': 0.0, 'Alcoholic hepatitis': 0.0, 'Allergy': 0.0, 'Arthritis': 0.0, 'Bronchial Asthma': 0.0, 'Cervical spondylosis': 0.0, 'Chicken pox': 0.0, 'Chronic cholestasis': 0.0, 'Common Cold': 0.0, 'Dengue': 0.0, 'Diabetes': 0.0, 'Dimorphic hemmorhoids(piles)': 0.0, 'Drug Reaction': 0.0, 'Fungal infection': 0.3333333333333333, 'GERD': 0.0, 'Gastroenteritis': 0.3333333333333333, 'Heart attack': 0.3333333333333333, 'Hepatitis B': 0.0, 'Hepatitis C': 0.0, 'Hepatitis D': 0.0, 'Hepatitis E': 0.0, 'Hypertension': 0.0, 'Hyperthyroidism': 0.0, 'Hypoglycemia': 0.0, 'Hypothyroidism': 0.0, 'Impetigo': 0.0, 'Jaundice': 0.0, 'Malaria': 0.0, 'Migraine': 0.0, 'Osteoarthristis': 0.0, 'Paralysis (brain hemorrhage)': 0.0, 'Peptic ulcer diseae': 0.0, 'Pneumonia': 0.0, 'Psoriasis': 0.0, 'Tuberculosis': 0.0, 'Typhoid': 0.0, 'Urinary tract infection': 0.0, 'Varicose veins': 0.0, 'hepatitis A': 0.0}