<a href="https://colab.research.google.com/github/iamdsc/disease_diagnoser/blob/master/Symptom_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import re
import random
import operator

def read_glove_vecs(file):
    with open(file, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            word = line[0]
            words.add(word)
            word_to_vec_map[word] = np.array(line[1:],dtype=np.float64)
    return words, word_to_vec_map

words, word_to_vec_map = read_glove_vecs("drive/My Drive/glove.6B.50d.txt")

In [0]:
def cosine_similarity(x, y):
    dot = np.dot(x,y)
    norm_x = np.sqrt(np.sum(x**2))
    norm_y = np.sqrt(np.sum(y**2))
    cosine_similarity = dot/(norm_x * norm_y)
    return cosine_similarity

In [0]:
df = pd.read_excel('raw_data.xlsx').drop('Count of Disease Occurrence', axis=1).fillna(method='ffill')

### Pre-processing the data to required format

In [0]:
df.Symptom = df.Symptom.map(lambda x: re.sub('^.*_','',x))
df.Disease = df.Disease.map(lambda x: re.sub('^.*_','',x))

df.Symptom = df.Symptom.map(lambda x: x.lower())
df.Disease = df.Disease.map(lambda x: x.lower())

# makes words like 'pain/swelling' into 'pain swelling'
df.Symptom = df.Symptom.map(lambda x: re.sub('(.*)\/(.*)', r'\1 \2', x))
df.Disease = df.Disease.map(lambda x: re.sub('(.*)\/(.*)', r'\1 \2', x))

# gets rid of parenthesised words
df.Symptom = df.Symptom.map(lambda x: re.sub('(.*)\(.*\)(.*)', r'\1\2', x))
df.Disease = df.Disease.map(lambda x: re.sub('(.*)\(.*\)(.*)', r'\1\2', x))

# gets rid of apostrophes and tokens of the sort '\xa0'
df.Symptom = df.Symptom.map(lambda x: re.sub('\'', '', x))
df.Disease = df.Disease.map(lambda x: re.sub('\'', '', x))
df.Disease = df.Disease.map(lambda x: re.sub('\\xa0', ' ', x))

In [6]:
df.head()

Unnamed: 0,Disease,Symptom
0,hypertensive disease,pain chest
1,hypertensive disease,shortness of breath
2,hypertensive disease,dizziness
3,hypertensive disease,asthenia
4,hypertensive disease,fall


In [7]:
#words in dataset that don't have a glove representation
#are being removed
counts = {}
def remove(x):
    for i in x.split():
        if not i in word_to_vec_map.keys():
            counts[i] = counts.get(i,0)+1

df.Symptom.map(lambda x:remove(x))
df.Disease.map(lambda x:remove(x))

0       None
1       None
2       None
3       None
4       None
        ... 
1861    None
1862    None
1863    None
1864    None
1865    None
Name: Disease, Length: 1866, dtype: object

In [0]:
#keeping unrepresented words in seperate dataframe
unrepresented_words = pd.DataFrame()
unrepresented_words['Words'] = counts.keys()
unrepresented_words['No. of Occurences'] = counts.values()
unrepresented_words.to_csv('Unrepresented_Words.csv')

### Reorganize the dataframe by grouping data by symptoms

In [9]:
frame = pd.DataFrame(df.groupby(['Symptom','Disease']).size()).drop(0,axis=1)
frame = frame.iloc[1:]
frame = frame.reset_index().set_index('Symptom')
frame

Unnamed: 0_level_0,Disease
Symptom,Unnamed: 1_level_1
abdomen acute,ileus
abdominal bloating,oralcandidiasis
abdominal tenderness,bacteremia
abdominal tenderness,dementia
abdominal tenderness,diverticulitis
...,...
yellow sputum,chronic kidney failure
yellow sputum,embolism pulmonary
yellow sputum,hepatitis b
yellow sputum,pneumocystis carinii pneumonia


In [0]:
#get how many times symptoms occur in dataset
counts = {}
for i in frame.index:
    counts[i] = counts.get(i, 0) + 1
#print(counts)

In [0]:
# Sorting symptoms by count and saving into dataframe
sym, ct = zip(*sorted(counts.items(), key=operator.itemgetter(1), reverse=True))
sym_count = pd.DataFrame()
sym_count['Symptom'] = sym
sym_count['Count'] = ct
sym_count.to_csv('Symptom_Counts.csv')

In [0]:
# drop symptoms with fewer than 6 entries in the dataset
for i in frame.index:
    if counts[i] < 6:
        #print(i)
        try:
            frame.drop(i, inplace=True)
        except:
            pass

In [13]:
# putting all diseases in dataset into a list
lst = []
frame.Disease.map(lambda x:lst.append(x))
#print(lst)

Symptom
abdominal tenderness    None
abdominal tenderness    None
abdominal tenderness    None
abdominal tenderness    None
abdominal tenderness    None
                        ... 
yellow sputum           None
yellow sputum           None
yellow sputum           None
yellow sputum           None
yellow sputum           None
Name: Disease, Length: 1200, dtype: object

In [0]:
"""
To train our own word embeddings on top of the existing GloVe model,
we will use a skipgram model. Each symptom has a disease associated with it,
and we use this as the (target word, context word) pair for skipgram generation.
We make a list that stores the pair and its corresponding label of 1, if the
disease is indeed associated with the symptom, and 0 otherwise.
"""
couples_and_labels = []

for i in frame.index.unique():
    # list of context words
    a = list(frame.Disease.loc[i].values)
    for j in a:
        # randomly choosing non-context words
        non_context = random.choice(list(set(lst)^set(a)))
        # assigning labels
        couples_and_labels.append((i,j,1))
        couples_and_labels.append((i,non_context,0))

In [0]:
# shuffling the values
b = random.sample(couples_and_labels, len(couples_and_labels))
symptom, disease, label = zip(*b)

In [16]:
# assigning a number to each symptom and disease to be fed in keras
s1 = pd.Series(list(symptom))
s2 = pd.Series(list(disease))
dic = {}
for i,j in enumerate(s1.append(s2).unique()):
    dic[j] = i
symptoms = np.array(s1.map(dic), dtype='int32')
diseases = np.array(s2.map(dic), dtype='int32')
print(symptoms)
print(diseases)

[ 0  1  2 ... 49  5 10]
[100 101 102 ... 131 133 126]


In [0]:
# converting labels too into an array
labels = np.array(label, dtype='int32')

In [18]:
# creating new embedding matrix
lst = []

# size of vocabulary ie no. of unique words in corpus
vocab_size = len(dic)

# dimension of word embeddings
vector_dim = 50

# create an array of zeros of shape (vocab_size, vector_dim)
embedding_matrix = np.zeros((len(dic),50))

for word, index in dic.items():
    for i in word.split():
        try:
            lst.append(word_to_vec_map[i])
        except:
            print(i)
    arr = np.array(lst)
    arrsum = arr.sum(axis=0)
    arrsum = arrsum/np.sqrt((arrsum**2).sum())
    embedding_matrix[index,:] = arrsum

transaminitis
orthopnea
rhonchus
prostatism
apyrexial
hemodynamically
hypokinesia
pleuritic
weepiness
thrombocytopaenia
deglutition
oralcandidiasis
exanthema
decubitus


In [19]:
# Training skipgram model
from keras.preprocessing import sequence
from keras.layers import Input, Dot, Reshape, Dense, Embedding
from keras.models import Model

input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(input_dim=vocab_size, output_dim=vector_dim,
                      input_length=1, name='embedding', trainable=True)

# loading pretrained embeddings
embedding.build((None,))
embedding.set_weights([embedding_matrix])

context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)

dot = Dot(axes=1)([context, target])
dot = Reshape((1,))(dot)

out = Dense(1, activation='sigmoid')(dot)

Using TensorFlow backend.













In [26]:
# create model instance
model = Model(input = [input_context, input_target], output = out)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['acc'])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 50)        11700       input_2[0][0]                    
                                                                 input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 50, 1)        0           embedding[0][0]            

  """Entry point for launching an IPython kernel.


In [27]:
model.fit(x=[symptoms, diseases], y=labels, epochs=25, validation_split=0.1)

Train on 2160 samples, validate on 240 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f8bee5185f8>

In [0]:
# get the new weights (embeddings) after running through keras
new_vecs = model.layers[2].get_weights()[0]

In [29]:
similarity_score = 0.6

d = pd.read_csv('/Dictionary.csv')
dic = {}
for i in d.index:
  dic[d.Key.loc[i]] = d.Values.loc[i]

symp = input('Enter symptom for which similar symptoms are to be found:')
print('\nThe similar symptoms are:')

for i in set(symptom):
  if (cosine_similarity(new_vecs[dic[i]],new_vecs[dic[symp]]))>similarity_score:
    if i!=symp:
      print(i)

Enter symptom for which similar symptoms are to be found:pain chest

The similar symptoms are:
sleeplessness
rale
