# Disease Detection from Symptoms

### Importing libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("datasets/Symptom2Disease.csv",index_col = 0)
df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


### Data Preparation

In [3]:
df.label.unique()
df['label_num']= df.label.map(dict(zip(pd.Series(df['label'].unique()),pd.Series([i for i in range(24)]))))
                              
df.head()

Unnamed: 0,label,text,label_num
0,Psoriasis,I have been experiencing a skin rash on my arm...,0
1,Psoriasis,"My skin has been peeling, especially on my kne...",0
2,Psoriasis,I have been experiencing joint pain in my fing...,0
3,Psoriasis,"There is a silver like dusting on my skin, esp...",0
4,Psoriasis,"My nails have small dents or pits in them, and...",0


In [4]:
lookup = dict(zip(pd.Series(df['label'].unique()),pd.Series([i for i in range(24)])))

In [5]:
print(lookup)

{'Psoriasis': 0, 'Varicose Veins': 1, 'Typhoid': 2, 'Chicken pox': 3, 'Impetigo': 4, 'Dengue': 5, 'Fungal infection': 6, 'Common Cold': 7, 'Pneumonia': 8, 'Dimorphic Hemorrhoids': 9, 'Arthritis': 10, 'Acne': 11, 'Bronchial Asthma': 12, 'Hypertension': 13, 'Migraine': 14, 'Cervical spondylosis': 15, 'Jaundice': 16, 'Malaria': 17, 'urinary tract infection': 18, 'allergy': 19, 'gastroesophageal reflux disease': 20, 'drug reaction': 21, 'peptic ulcer disease': 22, 'diabetes': 23}


### Tokenization

In [6]:
import spacy

nlp = spacy.load("en_core_web_lg") 

def preprocess(text):
    list =[]
    for token in nlp(text):
        if token.is_space or token.is_punct:
            continue
        list.append(token.lemma_)
    return ' '.join(list)

In [7]:
df['preprocess'] = df['text'].apply(preprocess)

### Vector Embedding

In [8]:
df['vector'] = df.preprocess.apply(lambda text: nlp(text).vector)
df.head()

Unnamed: 0,label,text,label_num,preprocess,vector
0,Psoriasis,I have been experiencing a skin rash on my arm...,0,I have be experience a skin rash on my arm leg...,"[-1.0520097, 2.0068107, -3.1425354, 1.3052415,..."
1,Psoriasis,"My skin has been peeling, especially on my kne...",0,my skin have be peel especially on my knee elb...,"[-1.3448839, 1.2253065, -4.0693727, 0.5828706,..."
2,Psoriasis,I have been experiencing joint pain in my fing...,0,I have be experience joint pain in my finger w...,"[-0.039949566, 1.4880179, -3.136055, 0.0100825..."
3,Psoriasis,"There is a silver like dusting on my skin, esp...",0,there be a silver like dust on my skin especia...,"[-1.5876127, 1.5015007, -3.8816297, 1.3765275,..."
4,Psoriasis,"My nails have small dents or pits in them, and...",0,my nail have small dent or pit in they and the...,"[-0.80246216, 2.5354614, -4.1618123, -0.066844..."


## Model

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values,df['label_num'],test_size= 0.20 , random_state = 23 , stratify= df['label_num'])

In [10]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

#### Feature Scaling

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train_2d)
X_test_scaled = scaler.transform(X_test_2d)

#### Model Training - Support Vector Classifier

In [12]:
from sklearn.svm import SVC

model = SVC(C=1, kernel='poly', degree=3, gamma="scale")
    
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

In [13]:
y_pred

array([ 8, 15,  6,  4,  6,  3, 10,  8,  3,  7,  2, 21, 19, 16,  5, 20, 16,
       23,  2,  2, 10,  5, 22,  0, 14, 17,  7,  1, 23,  2, 17, 16, 19,  5,
        7,  1, 16,  0, 20, 14, 13, 17,  2,  6,  9, 13,  6,  3,  3,  1,  6,
        4, 19,  8,  0, 23,  3, 13, 15, 18, 19,  8,  1,  7, 20, 12,  7, 21,
       19, 12,  3,  9,  6, 11, 22, 13, 12, 15,  3,  3, 23,  5,  8, 13, 20,
       16,  4,  6, 23, 18, 16,  0, 19,  5, 21,  3,  1,  3, 10,  0, 15, 11,
       23, 15,  5,  9, 11, 11, 19, 12, 20,  9,  8,  2,  8, 17, 10, 14,  2,
        7,  2, 14, 10, 20, 14, 16, 21,  9,  2, 12, 22, 17,  4, 12, 14, 15,
       10, 17, 11, 14, 13,  1, 19, 22, 13,  7, 17, 15, 18, 11,  9, 12, 18,
       16, 13, 12,  1, 11, 22,  9, 10,  7, 18,  0,  6, 17,  4, 22,  6, 10,
       18, 22, 14, 20,  1, 17, 20, 10, 20, 20, 23, 21,  6,  9,  8, 16, 16,
       20,  3, 15, 23, 21, 19, 11,  6, 14,  1, 14, 23, 21,  8, 18, 17,  4,
        1, 19, 15,  4,  4,  5, 10, 18, 17, 16, 22,  0,  0,  9,  3,  0,  1,
        8, 18, 13,  4,  7

#### Model Evaluation

In [14]:
from sklearn.metrics import classification_report

print('\n\n\n Classification Report   :\n\n\n ' , classification_report(y_test,y_pred))




 Classification Report   :


                precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.91      1.00      0.95        10
           2       0.89      0.80      0.84        10
           3       0.83      1.00      0.91        10
           4       1.00      1.00      1.00        10
           5       1.00      0.80      0.89        10
           6       0.91      1.00      0.95        10
           7       1.00      0.90      0.95        10
           8       1.00      1.00      1.00        10
           9       1.00      1.00      1.00        10
          10       0.91      1.00      0.95        10
          11       1.00      1.00      1.00        10
          12       1.00      1.00      1.00        10
          13       1.00      0.90      0.95        10
          14       0.82      0.90      0.86        10
          15       1.00      0.90      0.95        10
          16       0.91      1.00      0.95    

## Testing

In [15]:
t1 = "Bad cramps with bruise marks"

In [16]:
tp1 = preprocess(t1)

In [17]:
tp1 = nlp(tp1).vector

In [18]:
tp1 = tp1.reshape(1,-1)

In [19]:
tp1 = scaler.transform(tp1)

In [20]:
pred = model.predict(tp1)

In [21]:
print(pred)

[0]


In [22]:
value = [i for i in lookup if lookup[i]==pred[0]]
print(value[0])

Psoriasis


## Saving the model

In [23]:
import joblib
joblib.dump(model, 'model/model.pkl')
joblib.dump(scaler, 'model/scaler.pkl')

['scaler.pkl']

In [24]:
df.to_csv('datasets/lookup.csv')

In [25]:
df.text.iloc[34]

"My nails are starting to have small pits on them. I am worried and don't know what is causing it. Also, my joints pain and there are rashes on my arms and back."

In [26]:
df.text.iloc[34]

"My nails are starting to have small pits on them. I am worried and don't know what is causing it. Also, my joints pain and there are rashes on my arms and back."