In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')
import operator

In [2]:
df = pd.read_csv('../desease_data/dataset.csv')
for i in range(len(df)):
    for j in range(1, len(df.columns)):
        if not pd.isna(df.iloc[i, j]):
            df.iloc[i,j] = df.iloc[i,j].replace(' ', '').replace('_', ' ')
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


### Creaing a coulmn with a list of symptoms for each row

In [3]:
df["Symptoms"] = 0
records = df.shape[0]

for i in range(records):
    values = df.iloc[i].values
    values = values.tolist()
    if 0 in values:
        df["Symptoms"][i] = values[1:values.index(0)]
    else:
        df["Symptoms"][i] = values[1:]
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Symptoms
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,"[itching, skin rash, nodal skin eruptions, dis..."
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,"[skin rash, nodal skin eruptions, dischromic p..."
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,,"[itching, nodal skin eruptions, dischromic pat..."
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,,"[itching, skin rash, dischromic patches, nan, ..."
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,,"[itching, skin rash, nodal skin eruptions, nan..."


### Creating a DataFrame with symptoms as columns

In [4]:
column_values = df[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17']].values.ravel()

In [5]:
symps = pd.unique(column_values)
symps = symps.tolist()
symps = [i for i in symps if str(i) != "nan"]

In [6]:
symptoms = pd.DataFrame(columns = symps,index = df.index)
symptoms["Symptoms"] = df["Symptoms"]
symptoms.head()

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,foul smell ofurine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,Symptoms
0,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, nodal skin eruptions, dis..."
1,,,,,,,,,,,...,,,,,,,,,,"[skin rash, nodal skin eruptions, dischromic p..."
2,,,,,,,,,,,...,,,,,,,,,,"[itching, nodal skin eruptions, dischromic pat..."
3,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, dischromic patches, nan, ..."
4,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, nodal skin eruptions, nan..."


In [7]:
for i in symps:
    symptoms[i] = symptoms.apply(lambda x:1 if i in x.Symptoms else 0, axis=1)
    
symptoms["Disease"] = df["Disease"]
symptoms = symptoms.drop("Symptoms",axis=1)
symptoms.head()

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,foul smell ofurine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,Disease
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [8]:
train, test = train_test_split(symptoms,test_size=0.2)
X_train = train.drop("Disease",axis=1)
y_train = train["Disease"].copy()
X_test = test.drop("Disease",axis=1)
y_test = test["Disease"].copy()

In [9]:
rnd_forest = SVC(probability=True)
rnd_forest.fit(X_train,y_train)

In [10]:
cross_val_score(rnd_forest,X_train,y_train,cv=10).mean()

1.0

In [11]:
y_pred = rnd_forest.predict(X_test)
f1_score(y_test,y_pred, average="weighted")

1.0

In [12]:
def predict_disease(new_data, model):
    s = pd.DataFrame(columns = symps)
    s.loc[0] = 0
    for i in symps:
        s[i] = s.apply(lambda x:1 if i in new_data else 0, axis=1)
    return model.predict_proba(s)[0]

In [28]:
new_symptoms = ['itching', 'nodal skin eruptions','dischromic patches', 'fatigue', 'vomiting']

predictions = predict_disease(new_symptoms, rnd_forest)
prob_per_class_dictionary = dict(zip(rnd_forest.classes_, predictions))
sorted_prob_per_class_dictionary = sorted(prob_per_class_dictionary.items(), key=operator.itemgetter(1), reverse=True)
prediction1 = sorted_prob_per_class_dictionary[0][0]
confidence1 = "{:.2%}".format(sorted_prob_per_class_dictionary[0][1])
prediction2 = sorted_prob_per_class_dictionary[1][0]
confidence2 = "{:.2%}".format(sorted_prob_per_class_dictionary[1][1])

In [29]:
print("The patient may have " + prediction1 + " with " + confidence1 + " confidence.")

The patient may have Fungal infection with 28.78% confidence.


In [30]:
print("The patient may have " + prediction2 + " with " + confidence2 + " confidence.")

The patient may have Heart attack with 4.43% confidence.
