In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [66]:
df = pd.read_csv('/content/dataset.csv')
data_severity = pd.read_csv('/content/Symptom-severity.csv')

In [67]:
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [68]:
data_severity.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [69]:
df.shape

(4920, 18)

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 13  Symptom_13  504 non-null    object
 14  Symptom_14  306 non-null    object
 15  Symptom_15  240 non-null    object
 16  Symptom_16  192 non-null    object
 17  Symptom_17  72 non-null     object
dtypes: object(18)
memory usage: 692.0+ KB


In [71]:
def remove_space_between_words(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip().str.replace(" ", "_")
    return df

In [72]:
df = remove_space_between_words(df)
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal_infection,itching,skin_rash,nodal_skin_eruptions,dischromic__patches,,,,,,,,,,,,,
1,Fungal_infection,skin_rash,nodal_skin_eruptions,dischromic__patches,,,,,,,,,,,,,,
2,Fungal_infection,itching,nodal_skin_eruptions,dischromic__patches,,,,,,,,,,,,,,
3,Fungal_infection,itching,skin_rash,dischromic__patches,,,,,,,,,,,,,,
4,Fungal_infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [73]:
df[df['Disease']=='Acne'].values

array([['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'pus_filled_pimples', 'blackheads', ..., nan, nan, nan],
       ...,
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan],
       ['Acne', 'skin_rash', 'pus_filled_pimples', ..., nan, nan, nan]],
      dtype=object)

In [74]:
def encode_symptoms(df, data_severity):
    for i in data_severity.index:
        symptom = data_severity["Symptom"][i]
        weight = data_severity["weight"][i]
        df = df.replace(symptom, weight)

    # Replace missing values with 0
    df = df.fillna(0)

    # Additional hardcoded replacements
    df = df.replace("foul_smell_of_urine", 5)
    df = df.replace("dischromic__patches", 6)
    df = df.replace("spotting__urination", 6)

    return df

In [75]:
new_df = encode_symptoms(df, data_severity)

In [76]:
new_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal_infection,1,3,4,6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Fungal_infection,3,4,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Fungal_infection,1,4,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Fungal_infection,1,3,6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Fungal_infection,1,3,4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
names = []

# Iterate through columns except for "Disease"
for col in new_df.columns:
    if col != "Disease":
        # Iterate through rows in the column
        for symptom in new_df[col]:
            # Check if the value is a string and not in the 'names' list
            if isinstance(symptom, str) and symptom not in names:
                names.append(symptom)

# Check if all symptoms have been replaced
all_replaced = all(symptom not in names for symptom in data_severity["Symptom"])

if all_replaced:
    print("All symptoms have been replaced.")
else:
    print("The following symptoms were not replaced:", names)

All symptoms have been replaced.


In [78]:
# separating the data and labels
X = new_df.drop(columns='Disease', axis=1)
Y = new_df['Disease']

In [79]:
print(X)

      Symptom_1  Symptom_2  Symptom_3  Symptom_4  Symptom_5  Symptom_6  \
0             1          3          4          6          0        0.0   
1             3          4          6          0          0        0.0   
2             1          4          6          0          0        0.0   
3             1          3          6          0          0        0.0   
4             1          3          4          0          0        0.0   
...         ...        ...        ...        ...        ...        ...   
4915          5          3          5          6          4        4.0   
4916          3          2          2          2          0        0.0   
4917          6          4          5          6          0        0.0   
4918          3          3          3          2          2        2.0   
4919          3          7          4          2          3        0.0   

      Symptom_7  Symptom_8  Symptom_9  Symptom_10  Symptom_11  Symptom_12  \
0           0.0        0.0        

In [80]:
print(Y)

0                              Fungal_infection
1                              Fungal_infection
2                              Fungal_infection
3                              Fungal_infection
4                              Fungal_infection
                         ...                   
4915    (vertigo)_Paroymsal__Positional_Vertigo
4916                                       Acne
4917                    Urinary_tract_infection
4918                                  Psoriasis
4919                                   Impetigo
Name: Disease, Length: 4920, dtype: object


Data Standardization

In [81]:
scaler = StandardScaler()

In [82]:
scaler.fit(X)

In [83]:
standardized_data = scaler.transform(X)

In [84]:
print(standardized_data)

[[-1.83180372 -0.96557578 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 [-1.83180372 -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 ...
 [ 1.96708109 -0.13736225  0.66193895 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.96557578 -0.93921132 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498   2.34727837 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]]


In [85]:
X = standardized_data
Y = new_df['Disease']

In [86]:
print(X)

[[-1.83180372 -0.96557578 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 [-1.83180372 -0.13736225  1.46251409 ... -0.21790657 -0.18978799
  -0.12186667]
 ...
 [ 1.96708109 -0.13736225  0.66193895 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498  -0.96557578 -0.93921132 ... -0.21790657 -0.18978799
  -0.12186667]
 [-0.3122498   2.34727837 -0.13863618 ... -0.21790657 -0.18978799
  -0.12186667]]


In [87]:
print(Y)

0                              Fungal_infection
1                              Fungal_infection
2                              Fungal_infection
3                              Fungal_infection
4                              Fungal_infection
                         ...                   
4915    (vertigo)_Paroymsal__Positional_Vertigo
4916                                       Acne
4917                    Urinary_tract_infection
4918                                  Psoriasis
4919                                   Impetigo
Name: Disease, Length: 4920, dtype: object


Random Forest Classifier

In [88]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [89]:
rfc_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_classifier.fit(X_train, Y_train)

In [90]:
Y_pred = rfc_classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy

0.991869918699187

In [91]:
pd.crosstab(Y_test,Y_pred)

col_0,(vertigo)_Paroymsal__Positional_Vertigo,AIDS,Acne,Alcoholic_hepatitis,Allergy,Arthritis,Bronchial_Asthma,Cervical_spondylosis,Chicken_pox,Chronic_cholestasis,...,Osteoarthristis,Paralysis_(brain_hemorrhage),Peptic_ulcer_diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary_tract_infection,Varicose_veins,hepatitis_A
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo)_Paroymsal__Positional_Vertigo,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AIDS,0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acne,0,0,24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alcoholic_hepatitis,0,0,0,25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Allergy,0,0,0,0,24,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arthritis,0,0,0,0,0,23,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bronchial_Asthma,0,0,0,0,0,0,33,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cervical_spondylosis,0,0,0,0,0,0,0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
Chicken_pox,0,0,0,0,0,0,0,0,21,0,...,0,0,0,0,0,0,0,0,0,0
Chronic_cholestasis,0,0,0,0,0,0,0,0,0,15,...,0,0,0,0,0,0,0,0,0,0


In [92]:
print(confusion_matrix(Y_test,Y_pred))

[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [93]:
print(classification_report(Y_test,Y_pred))

                                         precision    recall  f1-score   support

(vertigo)_Paroymsal__Positional_Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic_hepatitis       1.00      1.00      1.00        25
                                Allergy       0.92      1.00      0.96        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial_Asthma       1.00      1.00      1.00        33
                   Cervical_spondylosis       1.00      0.87      0.93        23
                            Chicken_pox       1.00      1.00      1.00        21
                    Chronic_cholestasis       1.00      1.00      1.00        15
                            Common_Cold       1.00      1.00      1.00        23
                           

In [94]:
print( precision_score(Y_test, Y_pred, average='macro'))

0.9928005598737307


In [95]:
print(recall_score(Y_test, Y_pred,average='macro'))

0.9909757027776227


In [96]:
print(f1_score(Y_test, Y_pred, average='macro'))

0.9914564591749411


Decision Tree Classifier

In [97]:
from sklearn.tree import DecisionTreeClassifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [98]:
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, Y_train)

In [99]:
Y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy

0.991869918699187

In [100]:
pd.crosstab(Y_test,Y_pred)

col_0,(vertigo)_Paroymsal__Positional_Vertigo,AIDS,Acne,Alcoholic_hepatitis,Allergy,Arthritis,Bronchial_Asthma,Cervical_spondylosis,Chicken_pox,Chronic_cholestasis,...,Osteoarthristis,Paralysis_(brain_hemorrhage),Peptic_ulcer_diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary_tract_infection,Varicose_veins,hepatitis_A
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo)_Paroymsal__Positional_Vertigo,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AIDS,0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acne,0,0,24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alcoholic_hepatitis,0,0,0,25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Allergy,0,0,0,0,24,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arthritis,0,0,0,0,0,23,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bronchial_Asthma,0,0,0,0,0,0,33,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cervical_spondylosis,0,0,0,0,0,0,0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
Chicken_pox,0,0,0,0,0,0,0,0,21,0,...,0,0,0,0,0,0,0,0,0,0
Chronic_cholestasis,0,0,0,0,0,0,0,0,0,15,...,0,0,0,0,0,0,0,0,0,0


In [101]:
print(confusion_matrix(Y_test,Y_pred))

[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [102]:
print(classification_report(Y_test,Y_pred))

                                         precision    recall  f1-score   support

(vertigo)_Paroymsal__Positional_Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic_hepatitis       1.00      1.00      1.00        25
                                Allergy       0.92      1.00      0.96        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial_Asthma       1.00      1.00      1.00        33
                   Cervical_spondylosis       1.00      0.87      0.93        23
                            Chicken_pox       1.00      1.00      1.00        21
                    Chronic_cholestasis       1.00      1.00      1.00        15
                            Common_Cold       1.00      1.00      1.00        23
                           

In [103]:
print( precision_score(Y_test, Y_pred, average='macro'))

0.9928005598737307


In [104]:
print(recall_score(Y_test, Y_pred,average='macro'))

0.9909757027776227


In [105]:
print(f1_score(Y_test, Y_pred, average='macro'))

0.9914564591749411


Gaussian Naive Bayes classifier

In [106]:
from sklearn.naive_bayes import GaussianNB
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [107]:
NB_classifier = GaussianNB()
NB_classifier.fit(X_train, Y_train)

In [108]:
Y_pred = NB_classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy

0.8678861788617886

In [109]:
pd.crosstab(Y_test,Y_pred)

col_0,(vertigo)_Paroymsal__Positional_Vertigo,AIDS,Acne,Alcoholic_hepatitis,Allergy,Arthritis,Bronchial_Asthma,Cervical_spondylosis,Chicken_pox,Chronic_cholestasis,...,Osteoarthristis,Paralysis_(brain_hemorrhage),Peptic_ulcer_diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary_tract_infection,Varicose_veins,hepatitis_A
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo)_Paroymsal__Positional_Vertigo,15,0,0,0,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
AIDS,0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acne,0,0,24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alcoholic_hepatitis,4,0,0,19,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Allergy,0,0,0,0,19,0,0,0,0,0,...,0,5,0,0,0,0,0,0,0,0
Arthritis,0,2,0,0,2,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bronchial_Asthma,0,0,0,0,0,0,23,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cervical_spondylosis,0,0,0,0,0,0,0,14,0,0,...,0,0,0,0,0,0,0,0,0,0
Chicken_pox,0,0,0,0,0,0,0,0,19,0,...,0,0,0,2,0,0,0,0,0,0
Chronic_cholestasis,2,0,0,0,0,0,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0


In [110]:
print(confusion_matrix(Y_test,Y_pred))

[[15  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  5  0 ... 21  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 33]]


In [111]:
print(classification_report(Y_test,Y_pred))

                                         precision    recall  f1-score   support

(vertigo)_Paroymsal__Positional_Vertigo       0.71      0.83      0.77        18
                                   AIDS       0.68      1.00      0.81        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic_hepatitis       1.00      0.76      0.86        25
                                Allergy       0.90      0.79      0.84        24
                              Arthritis       1.00      0.83      0.90        23
                       Bronchial_Asthma       1.00      0.70      0.82        33
                   Cervical_spondylosis       0.54      0.61      0.57        23
                            Chicken_pox       1.00      0.90      0.95        21
                    Chronic_cholestasis       1.00      0.87      0.93        15
                            Common_Cold       1.00      1.00      1.00        23
                           

In [112]:
print( precision_score(Y_test, Y_pred, average='macro'))

0.8794765795368534


In [113]:
print(recall_score(Y_test, Y_pred,average='macro'))

0.8733242259602846


In [114]:
print(f1_score(Y_test, Y_pred, average='macro'))

0.8628509936790025


K-Nearest Neighbors Classifier

In [115]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [116]:
k=3

In [117]:
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, Y_train)

In [118]:
Y_pred = knn_classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy

0.991869918699187

In [119]:
pd.crosstab(Y_test,Y_pred)

col_0,(vertigo)_Paroymsal__Positional_Vertigo,AIDS,Acne,Alcoholic_hepatitis,Allergy,Arthritis,Bronchial_Asthma,Cervical_spondylosis,Chicken_pox,Chronic_cholestasis,...,Osteoarthristis,Paralysis_(brain_hemorrhage),Peptic_ulcer_diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary_tract_infection,Varicose_veins,hepatitis_A
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo)_Paroymsal__Positional_Vertigo,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AIDS,0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acne,0,0,24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alcoholic_hepatitis,0,0,0,25,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Allergy,0,0,0,0,23,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arthritis,0,0,0,0,0,23,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bronchial_Asthma,0,0,0,0,0,0,33,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cervical_spondylosis,0,0,0,0,0,0,0,20,0,0,...,0,0,0,0,0,0,0,0,0,0
Chicken_pox,0,0,0,0,0,0,0,0,21,0,...,0,0,0,0,0,0,0,0,0,0
Chronic_cholestasis,0,0,0,0,0,0,0,0,0,15,...,0,0,0,0,0,0,0,0,0,0


In [120]:
print(confusion_matrix(Y_test,Y_pred))

[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


In [121]:
print(classification_report(Y_test,Y_pred))

                                         precision    recall  f1-score   support

(vertigo)_Paroymsal__Positional_Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic_hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      0.96      0.98        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial_Asthma       1.00      1.00      1.00        33
                   Cervical_spondylosis       1.00      0.87      0.93        23
                            Chicken_pox       1.00      1.00      1.00        21
                    Chronic_cholestasis       1.00      1.00      1.00        15
                            Common_Cold       1.00      1.00      1.00        23
                           

In [122]:
print( precision_score(Y_test, Y_pred, average='macro'))

0.9933217189314751


In [123]:
print(recall_score(Y_test, Y_pred,average='macro'))

0.9909350523711186


In [124]:
print(f1_score(Y_test, Y_pred, average='macro'))

0.9915720052790706


K-Fold Cross Validation

In [125]:
from sklearn.model_selection import cross_val_score, KFold


In [138]:
from sklearn.svm import SVC
svm_classifier = SVC()

In [139]:
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [140]:
scores = cross_val_score(svm_classifier, X, Y, cv=kf, scoring='accuracy')


In [141]:
print(f'Cross-Validation Scores: {scores}')

Cross-Validation Scores: [0.96544715 0.95934959 0.98373984 0.95121951 0.97764228 0.95528455
 0.96138211 0.97764228 0.95325203 0.96544715]


In [142]:
mean_score = scores.mean()
std_deviation = scores.std()
precision = precision_score(Y_test, Y_pred, average='macro')
recall = recall_score(Y_test, Y_pred,average='macro')
f1 = f1_score(Y_test, Y_pred, average='macro')

In [143]:
print(f'Mean Accuracy: {mean_score}')
print(f'Standard Deviation: {std_deviation}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

Mean Accuracy: 0.9650406504065041
Standard Deviation: 0.010670247762932275
Accuracy: 0.9705284552845529
Precision: 0.9735474552547722
Recall: 0.9712520287698774
F1-Score: 0.971132617679643


SVM

In [128]:
from sklearn.svm import SVC
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=100)

In [129]:
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, Y_train)


In [130]:
Y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy

0.9705284552845529

In [131]:
pd.crosstab(Y_test,Y_pred)

col_0,(vertigo)_Paroymsal__Positional_Vertigo,AIDS,Acne,Alcoholic_hepatitis,Allergy,Arthritis,Bronchial_Asthma,Cervical_spondylosis,Chicken_pox,Chronic_cholestasis,...,Osteoarthristis,Paralysis_(brain_hemorrhage),Peptic_ulcer_diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary_tract_infection,Varicose_veins,hepatitis_A
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo)_Paroymsal__Positional_Vertigo,30,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AIDS,0,23,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acne,0,0,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alcoholic_hepatitis,0,0,0,17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Allergy,0,0,0,0,21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arthritis,0,0,0,0,0,21,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bronchial_Asthma,0,0,0,0,0,0,26,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cervical_spondylosis,0,0,0,0,0,0,0,22,0,0,...,0,0,0,0,0,0,0,0,0,0
Chicken_pox,0,0,0,0,0,0,0,0,21,0,...,0,0,0,0,0,0,0,0,0,0
Chronic_cholestasis,0,0,0,0,0,0,0,0,0,31,...,0,0,0,0,0,0,0,0,0,0


In [132]:
print(confusion_matrix(Y_test,Y_pred))

[[30  0  0 ...  0  0  0]
 [ 0 23  0 ...  0  0  0]
 [ 0  0 34 ...  0  0  0]
 ...
 [ 0  0  0 ... 27  0  0]
 [ 0  0  0 ...  0 25  0]
 [ 0  0  0 ...  0  0 26]]


In [134]:
print(classification_report(Y_test,Y_pred))

                                         precision    recall  f1-score   support

(vertigo)_Paroymsal__Positional_Vertigo       1.00      0.97      0.98        31
                                   AIDS       1.00      1.00      1.00        23
                                   Acne       1.00      1.00      1.00        34
                    Alcoholic_hepatitis       1.00      0.89      0.94        19
                                Allergy       0.81      1.00      0.89        21
                              Arthritis       1.00      1.00      1.00        21
                       Bronchial_Asthma       0.96      0.84      0.90        31
                   Cervical_spondylosis       0.85      1.00      0.92        22
                            Chicken_pox       1.00      1.00      1.00        21
                    Chronic_cholestasis       1.00      1.00      1.00        31
                            Common_Cold       1.00      1.00      1.00        26
                           

In [133]:
print( precision_score(Y_test, Y_pred, average='macro'))

0.9735474552547722


In [135]:
print(recall_score(Y_test, Y_pred,average='macro'))

0.9712520287698774


In [136]:
print(f1_score(Y_test, Y_pred, average='macro'))

0.971132617679643


In [144]:
import pickle


pickle.dump(rfc_classifier,open('startup.pkl','wb'))