In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os
# Show what is in the current working directory (the dataset CSV is read from here).
os.listdir('.')

['.ipynb_checkpoints',
 'Emotion classification.ipynb',
 's2d_svc_model',
 'Symptom2Disease.csv',
 'Symptoms and Disease nlp model.ipynb',
 'Symptoms to Disease prediction with sklearn pipeline.ipynb',
 's_to_d_svc_model',
 'test.txt',
 'train.txt',
 'Untitled.ipynb',
 'val.txt']

In [3]:
# Read the symptom-description dataset from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer='Symptom2Disease.csv')
df

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...,...
1195,295,diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,diabetes,I regularly experience these intense urges and...
1198,298,diabetes,"I have trouble breathing, especially outside. ..."


In [4]:
# Remove the 'Unnamed: 0' column (it appears to be a leftover row counter from
# the CSV export; it is not used anywhere below).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# List the distinct disease names present in the label column.
df['label'].unique()

array(['Psoriasis', 'Varicose Veins', 'Typhoid', 'Chicken pox',
       'Impetigo', 'Dengue', 'Fungal infection', 'Common Cold',
       'Pneumonia', 'Dimorphic Hemorrhoids', 'Arthritis', 'Acne',
       'Bronchial Asthma', 'Hypertension', 'Migraine',
       'Cervical spondylosis', 'Jaundice', 'Malaria',
       'urinary tract infection', 'allergy',
       'gastroesophageal reflux disease', 'drug reaction',
       'peptic ulcer disease', 'diabetes'], dtype=object)

In [6]:
from sklearn.preprocessing import LabelEncoder
# Fit an encoder on the disease names and store the resulting integer class ids
# in a new 'label_num' column; the fitted encoder stays available as `le`.
le = LabelEncoder().fit(df['label'])
df['label_num'] = le.transform(df['label'])
df

Unnamed: 0,label,text,label_num
0,Psoriasis,I have been experiencing a skin rash on my arm...,15
1,Psoriasis,"My skin has been peeling, especially on my kne...",15
2,Psoriasis,I have been experiencing joint pain in my fing...,15
3,Psoriasis,"There is a silver like dusting on my skin, esp...",15
4,Psoriasis,"My nails have small dents or pits in them, and...",15
...,...,...,...
1195,diabetes,I'm shaking and trembling all over. I've lost ...,19
1196,diabetes,"Particularly in the crevices of my skin, I hav...",19
1197,diabetes,I regularly experience these intense urges and...,19
1198,diabetes,"I have trouble breathing, especially outside. ...",19


In [7]:
# Show the distinct encoded class ids in ascending order.
np.sort(df['label_num'].unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

# Text Preprocessing

In [8]:
import spacy
from spacy.lang.en import STOP_WORDS
# Load the large English spaCy pipeline once; preprocess() reuses this object.
nlp = spacy.load(name="en_core_web_lg")

In [9]:
# Look at one raw description (row label 23) before any preprocessing.
df.loc[23, 'text']

"I'm having joint discomfort in my fingers, wrists, and knees. The pain is frequently aching and throbbing, and it worsens when I move my joints."

In [10]:
# Display spaCy's English stop-word set; preprocess() below removes any lemma
# that appears in it.
STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [11]:
def preprocess(text):
    """Normalise a piece of text for bag-of-words vectorisation.

    The text is run through the module-level spaCy pipeline ``nlp``.
    Punctuation tokens are dropped, every remaining token is replaced by its
    lower-cased lemma, and lemmas found in spaCy's ``STOP_WORDS`` are removed.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # The original guard read ``token.is_punct or token.is_punct``; the second
    # operand was redundant, so a single check gives the same result.
    # NOTE(review): the duplicate may have been meant as ``token.is_stop`` or
    # ``token.is_space`` -- confirm before changing what gets filtered, since
    # the saved model was trained on the current output.
    lemmas = (
        token.lemma_.lower()
        for token in nlp(text)
        if not token.is_punct
    )
    # Stop words are matched against the lower-cased lemma, not the surface
    # form. A generator avoids the local variable that shadowed builtin `list`.
    return ' '.join(lemma for lemma in lemmas if lemma not in STOP_WORDS)

In [12]:
# Normalise every description with preprocess() and keep the result alongside
# the raw text.
df['preprocessed'] = df['text'].map(preprocess)

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation. The split is stratified on the
# encoded label and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed'],
    df['label_num'],
    test_size=0.30,
    random_state=31,
    stratify=df['label_num'],
)

In [14]:
# Preview the training split: preprocessed text, indexed by original row label.
X_train

497     find incredibly difficult recently use restroo...
705                feel excessively hungry eat stiff neck
913     low temperatue foul smell pee area near kidney...
490     experience lot bowel movement issue recently d...
899     severe itching chill vomiting high fever sweat...
                              ...                        
425     trouble breathe uneasy throat fill lot phlegm ...
1076    chest pain extreme nausea present recently che...
364     sneeze lot feel tired sick lot gross stuff com...
6       skin mouth nose eye red inflame itchy uncomfor...
155     lymph node swollen cause discomfort neck armpi...
Name: preprocessed, Length: 840, dtype: object

# Building a scikit-learn pipeline

In [15]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer


In [16]:
from sklearn.pipeline import Pipeline
# Two-step pipeline: token counts from CountVectorizer feed an SVC classifier.
# Both estimators are constructed with their default arguments.
pipeLine = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('svc', SVC()),
])

In [17]:
# Fit the vectorizer and the classifier on the training split.
pipeLine.fit(X=X_train, y=y_train)

In [18]:
# Score the fitted pipeline on the held-out split.
pipeLine.score(X=X_test, y=y_test)

0.9666666666666667

# Saving the trained model

In [19]:
import pickle

In [20]:
# Serialise the fitted pipeline to 's2d_svc_model' in the working directory so
# it can be reloaded later without retraining.
with open('s2d_svc_model', mode='wb') as file:
    file.write(pickle.dumps(pipeLine))

# Loading the saved model

In [21]:
import os
# List the working directory again; 's2d_svc_model' is the file written above.
os.listdir('.')

['.ipynb_checkpoints',
 'Emotion classification.ipynb',
 's2d_svc_model',
 'Symptom2Disease.csv',
 'Symptoms and Disease nlp model.ipynb',
 'Symptoms to Disease prediction with sklearn pipeline.ipynb',
 's_to_d_svc_model',
 'test.txt',
 'train.txt',
 'Untitled.ipynb',
 'val.txt']

In [22]:
# Read the pickled pipeline back from disk into `s2d`.
# NOTE: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open('s2d_svc_model', mode='rb') as file:
    s2d = pickle.loads(file.read())

In [23]:
# Two free-text symptom descriptions used to try out the reloaded model.
dummy_symptoms = [
    "Standing or walking for long periods of time causes a lot of pain in my legs. I get cramps upon doing physical activities. There are bruise marks on my legs too.",
    "I've been feeling exhausted and weak, and I can't seem to get rid of it. Because of the vomiting and nausea, I've entirely lost my appetite. My belly pains which are causing me concern.",
]

In [24]:
# Run the sample descriptions through the same preprocess() used for training,
# so the model sees text in the form it was fitted on.
preprocessed_dummy = pd.Series(dummy_symptoms).apply(preprocess)
preprocessed_dummy

0    stand walk long period time cause lot pain leg...
1    feel exhausted weak rid vomiting nausea entire...
dtype: object

In [25]:
# Predict an encoded class id for each preprocessed sample description.
s2d.predict(X=preprocessed_dummy)

array([17, 16])

In [26]:
# Disease name predicted for the 1st description ('Varicose Veins' in the
# recorded run). The fitted LabelEncoder decodes the model's actual prediction,
# so no class id is hard-coded and the cell stays correct after retraining.
le.inverse_transform(s2d.predict(preprocessed_dummy))[0]

'Varicose Veins'

In [27]:
# Disease name predicted for the 2nd description ('Typhoid' in the recorded
# run). The fitted LabelEncoder decodes the model's actual prediction, so no
# class id is hard-coded and the cell stays correct after retraining.
le.inverse_transform(s2d.predict(preprocessed_dummy))[1]

'Typhoid'