In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import shap
from sklearn.model_selection import train_test_split
import joblib

In [2]:
train_dataset=pd.read_csv("/kaggle/input/disease-prediction-using-machine-learning/Training.csv")
train_dataset.columns.tolist()

['itching',
 'skin_rash',
 'nodal_skin_eruptions',
 'continuous_sneezing',
 'shivering',
 'chills',
 'joint_pain',
 'stomach_pain',
 'acidity',
 'ulcers_on_tongue',
 'muscle_wasting',
 'vomiting',
 'burning_micturition',
 'spotting_ urination',
 'fatigue',
 'weight_gain',
 'anxiety',
 'cold_hands_and_feets',
 'mood_swings',
 'weight_loss',
 'restlessness',
 'lethargy',
 'patches_in_throat',
 'irregular_sugar_level',
 'cough',
 'high_fever',
 'sunken_eyes',
 'breathlessness',
 'sweating',
 'dehydration',
 'indigestion',
 'headache',
 'yellowish_skin',
 'dark_urine',
 'nausea',
 'loss_of_appetite',
 'pain_behind_the_eyes',
 'back_pain',
 'constipation',
 'abdominal_pain',
 'diarrhoea',
 'mild_fever',
 'yellow_urine',
 'yellowing_of_eyes',
 'acute_liver_failure',
 'fluid_overload',
 'swelling_of_stomach',
 'swelled_lymph_nodes',
 'malaise',
 'blurred_and_distorted_vision',
 'phlegm',
 'throat_irritation',
 'redness_of_eyes',
 'sinus_pressure',
 'runny_nose',
 'congestion',
 'chest_pain',


In [3]:
train_dataset.pop('Unnamed: 133')
target=train_dataset['prognosis']
train_dataset.pop('prognosis')

0                              Fungal infection
1                              Fungal infection
2                              Fungal infection
3                              Fungal infection
4                              Fungal infection
                         ...                   
4915    (vertigo) Paroymsal  Positional Vertigo
4916                                       Acne
4917                    Urinary tract infection
4918                                  Psoriasis
4919                                   Impetigo
Name: prognosis, Length: 4920, dtype: object

In [4]:
for i in train_dataset.isnull().sum():
    if i!=0:
        print(i)

In [5]:
print([i for i in train_dataset.columns.tolist() if len(train_dataset[i])!=len(train_dataset['itching'])])

[]


In [6]:
print([train_dataset[i] for i in train_dataset.columns.tolist() if train_dataset[i].dtype!='int64'])

[]


In [7]:
classification_models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=666),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier()
}

In [8]:
encoder = LabelEncoder()
target = encoder.fit_transform(target)

In [9]:
def train_models(X,y,model,i):
    train_x,test_x,train_y,test_y=train_test_split(X,y,test_size=0.2, random_state=42)
    model_fitted=model.fit(train_x,train_y)
    predictions=model_fitted.predict(test_x)
    accuracy = accuracy_score(test_y, predictions)
    precision = precision_score(test_y, predictions, average='weighted')
    recall = recall_score(test_y, predictions, average='weighted')
    f1 = f1_score(test_y, predictions, average='weighted')
    return {"F1-Score":f1,
           "Precision":precision,
           "Recall":recall,
           "Accuracy":accuracy,}

In [10]:
for i in classification_models:
    print("Model : "+i)
    print(train_models(train_dataset,target,classification_models[i],i))

Model : Logistic Regression
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}
Model : Decision Tree
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}
Model : Random Forest
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}
Model : Gradient Boosting
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}
Model : SVM
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}
Model : K-Nearest Neighbors
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}
Model : Naive Bayes
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}
Model : Neural Network
{'F1-Score': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'Accuracy': 1.0}


In [11]:
rf_model = RandomForestClassifier()
param_dist = {
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_random = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
rf_random.fit(train_dataset,target)
print("Best Hyperparameters:", rf_random.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


The total space of parameters 18 is smaller than n_iter=100. Running 18 iterations. For exhaustive searches, use GridSearchCV.


Best Hyperparameters: {'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}


In [12]:
rf_model.fit(train_dataset,target)

In [13]:
best_rf_model = rf_random.best_estimator_
joblib.dump(rf_model , "./random_forest1.joblib")

['./random_forest1.joblib']

In [14]:
train_dataset1=pd.read_csv("/kaggle/input/disease-prediction-using-machine-learning/Training.csv")
print(set(train_dataset1['prognosis']),set(target))

{'Impetigo', 'Hepatitis D', 'Bronchial Asthma', 'Acne', 'AIDS', 'Dimorphic hemmorhoids(piles)', 'Common Cold', 'Dengue', 'Hepatitis E', 'Hypertension ', 'Varicose veins', 'Jaundice', 'Psoriasis', 'Chronic cholestasis', 'Hypoglycemia', 'Drug Reaction', 'Peptic ulcer diseae', 'Cervical spondylosis', 'Hepatitis B', 'Paralysis (brain hemorrhage)', 'Tuberculosis', 'Hyperthyroidism', 'Gastroenteritis', 'Osteoarthristis', 'GERD', 'Chicken pox', '(vertigo) Paroymsal  Positional Vertigo', 'Migraine', 'Malaria', 'Hypothyroidism', 'Typhoid', 'Fungal infection', 'Urinary tract infection', 'Pneumonia', 'hepatitis A', 'Diabetes ', 'Heart attack', 'Arthritis', 'Allergy', 'Hepatitis C', 'Alcoholic hepatitis'} {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40}
