<a href="https://colab.research.google.com/github/khurramahmed/diseaseprediction/blob/main/diseaseprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Raw GitHub location of the training CSV.
trainurl = (
    'https://raw.githubusercontent.com/'
    'khurramahmed/diseaseprediction/main/Training.csv'
)

In [44]:
# Download the training table and keep columns 0-132 only: later cells use
# columns 0-131 as features and column 132 as the label.
# NOTE(review): presumably the CSV carries extra trailing column(s) that this
# slice drops — confirm against the file.
train = pd.read_csv(trainurl).iloc[:, :133]

In [None]:
# Preview the first five rows of the loaded training frame.
train.head(n=5)

In [46]:
# List every distinct label found in the 'prognosis' column.
print(train.loc[:, 'prognosis'].unique())

['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes ' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension ' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo']


In [None]:
# Encode the values in column 132 as integer class ids. `le` keeps the fitted
# value <-> id mapping (`le.classes_`) so it can be reused on other data.
le = LabelEncoder()
# Assign through the column label rather than `train.iloc[:, 132] = ...`:
# on pandas >= 2 positional assignment writes into the existing object-dtype
# column in place, which leaves the dtype as object instead of integer.
train[train.columns[132]] = le.fit_transform(train.iloc[:, 132])
train.head()

In [13]:
def Encoder(df, encoders=None):
    """Label-encode every ``object``/``category`` column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    encoders : dict, optional
        If given, the ``LabelEncoder`` fitted for each successfully encoded
        column is stored in it under the column label, so the integer codes
        can be mapped back later with ``inverse_transform``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Encoding is best effort: a column whose encoding fails is left untouched,
    a message with the reason is printed, and the remaining columns are still
    processed.
    """
    columnsToEncode = list(df.select_dtypes(include=['category', 'object']))
    for feature in columnsToEncode:
        # A fresh encoder per column so each one keeps its own fitted classes.
        le = LabelEncoder()
        try:
            df[feature] = le.fit_transform(df[feature])
        except Exception as exc:
            # f-string formatting: column labels are not necessarily strings,
            # so string concatenation could itself raise here.
            print(f'Error encoding {feature}: {exc}')
            continue
        if encoders is not None:
            encoders[feature] = le
    return df

In [38]:
def Decoder(df, encoders=None):
    """Map label-encoded columns of ``df`` back to their original values, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the encoded columns. It is modified in place.
    encoders : mapping, optional
        Column label -> fitted encoder exposing ``inverse_transform``. Only
        the columns listed here (and present in ``df``) are decoded. When
        omitted there is no fitted mapping to invert, so the
        ``object``/``category`` columns are reported and left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Decoding is best effort: a column that cannot be decoded is left untouched
    and a message with the reason is printed.
    """
    if encoders is None:
        # Legacy call without fitted encoders: keep the original column
        # selection so the caller still gets one message per column.
        encoders = {}
        columnsToDecode = list(df.select_dtypes(include=['category', 'object']))
    else:
        columnsToDecode = [feature for feature in encoders if feature in df.columns]

    for feature in columnsToDecode:
        fitted = encoders.get(feature)
        if fitted is None:
            # An unfitted encoder cannot invert anything; say so explicitly.
            print(f'Error decoding {feature}: no fitted encoder available')
            continue
        try:
            df[feature] = fitted.inverse_transform(df[feature])
        except Exception as exc:
            print(f'Error decoding {feature}: {exc}')
    return df

In [None]:
# Label-encode any remaining object/category columns, then preview the result.
train = Encoder(df=train)
train.head(n=5)

In [None]:
# Count missing values per column instead of rendering the full boolean mask,
# and show the ten columns with the most missing entries.
train.isnull().sum().sort_values(ascending=False).head(10)

In [59]:
# Training labels: the last column of the training frame.
y = train.iloc[:, train.shape[1] - 1]

In [60]:
# Training features: the first 132 columns of the training frame.
x = train.iloc[:, :132]

In [None]:
# Preview the first five feature rows.
x.head(n=5)

In [None]:
# Preview the first five labels.
y.head(n=5)

In [None]:
# Fit a randomized tree ensemble to obtain per-feature importance scores.
# A fixed random_state makes the scores reproducible across re-runs; without
# it the importances (and any ranking derived from them) change on every run.
model = ExtraTreesClassifier(random_state=42)
model.fit(x, y)
print(model.feature_importances_)  # The higher the score, the more the feature contributes to explaining the output variable.

In [64]:
# Label each importance score with the name of its feature column.
importance = pd.Series(data=model.feature_importances_, index=x.columns)

In [None]:
# Names of the 20 features with the highest importance scores.
importance.nlargest(n=20).index

In [None]:
# Horizontal bar chart of the 20 highest importance scores.
importance.nlargest(n=20).plot.barh()

In [67]:
# Hand-picked subset for the correlation study: 19 symptom columns followed by
# the 'prognosis' label column.
for_matrix = train.loc[:, [
    'muscle_pain',
    'itching',
    'yellowing_of_eyes',
    'unsteadiness',
    'vomiting',
    'altered_sensorium',
    'dark_urine',
    'diarrhoea',
    'chest_pain',
    'lack_of_concentration',
    'abnormal_menstruation',
    'fatigue',
    'mild_fever',
    'joint_pain',
    'mucoid_sputum',
    'muscle_weakness',
    'high_fever',
    'increased_appetite',
    'passage_of_gases',
    'prognosis',
]]

In [None]:
# Preview the first five rows of the selected columns.
for_matrix.head(n=5)

In [69]:
# Pairwise Pearson correlations between the selected columns; the index of the
# result holds the names of the columns that entered the matrix.
corrmat = for_matrix.corr(method='pearson')
corr_important_features = corrmat.index

In [None]:
# Large canvas so the annotated cells stay legible.
plt.figure(figsize=(20, 20))
# Plot the correlation matrix already held in `corrmat` rather than
# recomputing it from `for_matrix[corr_important_features]`, which yields the
# same values.
corr_heatmap = sns.heatmap(corrmat, annot=True, cmap='RdYlGn')
# sns.heatmap returns the matplotlib Axes, so the title is set on it directly.
corr_heatmap.set_title('Correlation between selected symptoms and prognosis')

In [None]:
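# Variables with the highest correlation to prognosis are itching, unsteadiness (-0.26), diarrhoea (0.29), mild_fever (0.2), and joint_pain (0.23).
# Caveat: prognosis is a label-encoded category, so these coefficients depend on the integer code assigned to each disease and should be read as a rough guide only.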

In [None]:
# k-nearest-neighbours classifier: 101 neighbours, Hamming distance between
# feature rows.
classifier = KNeighborsClassifier(metric='hamming', n_neighbors=101)
classifier.fit(X=x, y=y)

In [72]:
# Raw GitHub location of the held-out testing CSV.
testurl = (
    "https://raw.githubusercontent.com/"
    "khurramahmed/diseaseprediction/main/Testing.csv"
)

In [None]:
test = pd.read_csv(testurl)
# Encode the test labels with the mapping already fitted on the training
# labels. `transform` (not `fit_transform`) keeps the class ids identical to
# the ones the classifier was trained on, and raises on a label that was never
# seen in training instead of silently renumbering the classes.
# NOTE(review): assumes `le` still holds the fit on the original training
# label strings — confirm the cells were run top to bottom.
# Assigning through the column label replaces the column, so it gets an integer
# dtype; `test.iloc[:, 132] = ...` would keep the object dtype on pandas >= 2.
test[test.columns[132]] = le.transform(test.iloc[:, 132])
test.head()

In [102]:
# Split the test frame the same way as the training frame:
# first 132 columns are features, the last column is the label.
x_test = test.iloc[:, :132]
y_test = test.iloc[:, test.shape[1] - 1]

In [103]:
# Predicted class id for every row of the test features.
y_pred = classifier.predict(X=x_test)

In [104]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the confusion matrix first, then the per-class
# precision/recall/F1 report, each on its own line(s).
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          15   