In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the three sheets of the workbook. `read_excel` has no `columns=` rename
# option, so the 'Terms' columns are renamed separately after loading.
disease = pd.read_excel(r'data.xls', sheet_name='disease_terms')
symp = pd.read_excel(r'data.xls', sheet_name='symptom_terms')
# Relationship dataset that is used to merge disease and symptoms
# (the keyword was previously misspelled as `heet_name`).
rel = pd.read_excel(r'data.xls', sheet_name="dis_symp_relationships")

In [3]:
# Give the generic 'Terms' column a table-specific name in each frame.
symp = symp.rename(columns={'Terms': 'symptoms'})
disease = disease.rename(columns={'Terms': 'disease'})

In [4]:
# Preview the first five disease rows.
disease.head(n=5)

Unnamed: 0,rid,disease_cui,snomed_code,disease
0,1,C0343681,240562003,Pustular syphilide
1,2,C0027145,43153006,Myxedema (disorder)
2,3,C0156346,198253003,Endometriosis of rectovaginal septum and vagina
3,4,C0264604,51100001,Hypokinetic parkinsonian dysphonia
4,5,C0268982,72746005,Infertility due to incomplete spermatogenic ar...


In [5]:
# Preview the first five symptom rows.
symp.head(n=5)

Unnamed: 0,rid,symptom_cui,snomed_code,symptoms
0,1,C0007940,5168000,Sporotrichotic chancre
1,2,C0157725,201023009,Other specified pruritic conditions (disorder)
2,3,C0234544,66264000,Todd's paresis
3,4,C0231246,36440009,Failure to gain weight (finding)
4,5,C0427086,267078001,Involuntary movement


In [6]:
# Preview the first five disease-symptom relationship rows.
rel.head(n=5)

Unnamed: 0,rid,disease_cui,symptom_cui
0,1,C0001175,C1720342
1,898,C0001175,C1720231
2,1167,C0001175,C1719857
3,2,C0001314,C0232597
4,308,C0001314,C0585542


In [7]:
# Attach the disease columns to every relationship row (left join on the disease CUI).
comb = pd.merge(rel, disease, how='left', on='disease_cui')

In [8]:
# Attach the symptom columns as well (left join on the symptom CUI).
comb = pd.merge(comb, symp, how='left', on='symptom_cui')

In [9]:
# Drop, in place, every row that has a null in any column.
comb.dropna(axis=0, how='any', inplace=True)

In [10]:
import itertools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
import re
def cleanup(s):
    """Normalise a raw symptom description into lowercase words.

    The input is coerced to ``str`` and lower-cased. Mention-like tokens,
    punctuation, digits and the literal substring 'symptom' are removed, runs
    of a repeated character are capped at two, and English stop words and
    words of one or two characters are dropped.

    All regex patterns are raw strings so that regex escapes are not parsed
    as (invalid) string escape sequences.

    Parameters
    ----------
    s : object
        Text to clean; anything accepted by ``str()``.

    Returns
    -------
    str
        The cleaned text. Gaps left by removed short words are not collapsed,
        so the result may contain consecutive spaces.
    """
    s = str(s)
    s = s.lower()
    s = re.sub(r'rt*.@\w+', ' ', s)   # 'r', optional 't's, any char, then an @handle
    s = re.sub(r'@\w+', ' ', s)       # remaining @handles
    s = re.sub(r'\W,\s', ' ', s)
    s = re.sub(r'[^\w,]', ' ', s)     # keep only word characters and commas
    s = re.sub(r"\d+", "", s)         # drop digits
    s = re.sub(r'\s+', ' ', s)        # collapse whitespace runs
    s = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', s)
    # Dots were already turned into spaces above, so this cannot match any more;
    # kept so the sequence of operations is unchanged.
    s = s.replace(".co", "")
    s = s.replace(",", "")
    # Literal (non-regex) replacement of the four characters `[\w*`; same value
    # as before, now spelled as a raw string.
    s = s.replace(r"[\w*", " ")
    s = s.replace('symptom', '')  # remove symptom word attached in text
    s = ''.join(''.join(a)[:2] for _, a in itertools.groupby(s))  # changing words like cooool to cool
    word_tokens = word_tokenize(s)
    s = [w for w in word_tokens if w not in stop_words]  # stopwords removal
    s = " ".join(s)
    s = re.sub(r'\b\w{1,2}\b', '', s)  # remove words having length less than 3
    return s

In [11]:
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# First letter of a Penn Treebank tag -> WordNet POS code for the lemmatizer.
# Adjective tags are JJ/JJR/JJS, so the key is 'j', not 'a'.
_PENN_TO_WORDNET = {'j': 'a', 'n': 'n', 'v': 'v'}


def lemme(s):
    """Lemmatise every token of ``s`` using its part-of-speech tag.

    Tokens tagged as adjective, noun or verb are lemmatised with the matching
    WordNet POS; every other token uses the lemmatizer's default POS.

    Parameters
    ----------
    s : str
        Text to tokenise, tag and lemmatise.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    lemmas = []
    for word, tag in pos_tag(word_tokenize(s)):
        wn_pos = _PENN_TO_WORDNET.get(tag[:1].lower())
        if wn_pos:
            lemmas.append(wnl.lemmatize(word, wn_pos))
        else:
            lemmas.append(wnl.lemmatize(word))
    return " ".join(lemmas)

In [12]:
# Clean every symptom text and store the result in a new 'clean' column.
comb['clean'] = comb['symptoms'].map(cleanup)

In [13]:
# Lemmatise the cleaned text so that different inflections of a word share
# one root form.
comb['lemme'] = comb['clean'].map(lemme)

In [14]:
# Keep only the disease name and the lemmatised text, exposing the latter
# under the name 'symptoms'.
comb = comb[['disease', 'lemme']].rename(columns={'lemme': 'symptoms'})

In [15]:
# Preview the first five disease / symptom pairs.
comb.head(n=5)

Unnamed: 0,disease,symptoms
0,Acquired immune deficiency syndrome (AIDS) (di...,fatigue associate aid
1,Acquired immune deficiency syndrome (AIDS) (di...,malaise associate aid
2,Acquired immune deficiency syndrome (AIDS) (di...,hepatomegaly associate aid
3,Acute disease,acute vomit
4,Acute disease,benign paroxysmal positional vertigo nystagmus


### One disease can have several symptoms, so I am building a dictionary that combines all the symptoms of each disease

In [16]:
# One entry per distinct disease, initially without any symptoms.
new = {name: None for name in set(comb.disease)}

In [17]:
# Join all symptom strings of a disease into one comma-separated value.
# A single groupby replaces the row-by-row loop, which did several positional
# `iloc` lookups and a string concatenation per row. Rows keep their original
# order within each group, so the joined strings are the same.
symptoms_by_disease = comb.groupby('disease', sort=False)['symptoms'].agg(','.join)
for name, joined in symptoms_by_disease.items():
    if name in new:  # only fill the pre-registered disease keys
        new[name] = joined

In [19]:
# Turn the {disease: joined symptoms} mapping into a frame: the single value
# column is named 'symptoms', the dict keys become a 'disease' column, and the
# index is replaced by 0..n-1.
temp = pd.DataFrame.from_dict(new, orient='index')
temp = temp.rename(columns={0: 'symptoms'})
temp['disease'] = temp.index
temp.index = range(len(temp))

In [23]:
# Put 'disease' first, then preview five rows.
temp = temp.loc[:, ['disease', 'symptoms']]
temp.head(n=5)

Unnamed: 0,disease,symptoms
0,Malignant vasovagal syndrome,"malarial pigment deposition,intermittent fever"
1,(Disturbance of consciousness) or (faint/synco...,"syncope,syncope collapse"
2,Blind or low vision - one eye only,"syncope,syncope collapse,vasovagal,disturbance..."
3,Irritable bowel syndrome characterized by cons...,"unqualified visual loss one eye,visual loss on..."
4,Functional abdominal pain syndrome,constipation disorder


### I am using TF-IDF because it scores words based on their occurrence, which will help me find the similarity between two symptoms

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Unigram TF-IDF over the joined symptom strings, tokenised with NLTK.
# The flags are real booleans: recent scikit-learn versions validate parameter
# types and reject the integer 1 where a bool is expected.
vec = TfidfVectorizer(ngram_range=(1, 1), tokenizer=word_tokenize,
                      min_df=2, max_df=0.9, strip_accents='unicode', use_idf=True,
                      smooth_idf=True, sublinear_tf=True, stop_words='english')


In [25]:
# Learn the vocabulary and IDF weights, and vectorise every disease's symptom text.
vector = vec.fit_transform(temp['symptoms'].values)

### Let's find out which symptoms are closest to headache

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between each row of the TF-IDF matrix and the query term.
# The query is vectorised once, outside the loop, instead of once per row.
query_vec = vec.transform(['headache'])
b = dict()
for i in range(vector.shape[0]):
    # key: row position as a string; value: 1x1 array holding the similarity
    b['%s' % i] = cosine_similarity(vector[i], query_vec)
    
    

In [33]:
# Collect the row positions whose similarity to the query is strictly between
# 0 and 1: a score of 1 means the row matches the query term alone, and a
# score of 0 means the row shares no term with it.
c = []
for key, score in b.items():
    value = np.asarray(score).item()  # unwrap the 1x1 similarity array
    if 0 < value < 1:
        # Keys are stringified row positions; positional indexing with
        # `.iloc` needs integers, so convert back before storing.
        c.append(int(key))

### The two steps above could be combined into one, but I kept them separate for clarity

### Let's see the results of the cosine similarity

In [57]:
import operator
# Show the 100 (key, similarity) pairs with the highest similarity.
sorted(b.items(), key=lambda pair: pair[1], reverse=True)[:100]

[('18', array([[1.]])),
 ('87', array([[1.]])),
 ('100', array([[1.]])),
 ('108', array([[1.]])),
 ('120', array([[1.]])),
 ('134', array([[1.]])),
 ('268', array([[1.]])),
 ('269', array([[1.]])),
 ('270', array([[1.]])),
 ('324', array([[1.]])),
 ('344', array([[1.]])),
 ('372', array([[1.]])),
 ('376', array([[1.]])),
 ('484', array([[1.]])),
 ('488', array([[1.]])),
 ('495', array([[1.]])),
 ('515', array([[1.]])),
 ('578', array([[1.]])),
 ('609', array([[1.]])),
 ('625', array([[1.]])),
 ('673', array([[1.]])),
 ('680', array([[1.]])),
 ('686', array([[1.]])),
 ('709', array([[1.]])),
 ('749', array([[1.]])),
 ('755', array([[1.]])),
 ('772', array([[1.]])),
 ('799', array([[1.]])),
 ('801', array([[1.]])),
 ('825', array([[1.]])),
 ('840', array([[1.]])),
 ('1011', array([[1.]])),
 ('1015', array([[1.]])),
 ('1040', array([[1.]])),
 ('1042', array([[1.]])),
 ('1074', array([[1.]])),
 ('1075', array([[1.]])),
 ('1093', array([[1.]])),
 ('1112', array([[1.]])),
 ('1140', array([[1

In [45]:
# Two rows where headache is the only symptom.
print(temp.symptoms.iloc[18])
print(temp.symptoms.iloc[100])

headache find
headache find


In [47]:
# Two rows with no relation to headache.
print(temp.symptoms.iloc[10])
print(temp.symptoms.iloc[13])

newborn physiological jaundice disorder
pain finding


In [50]:
# Symptom text of the rows selected as related to headache.
temp.symptoms.iloc[c]

14            pain finding,headache find,neuropathic pain
146           pain finding,headache find,neuropathic pain
150                    thunderclap headache,headache find
210                            pain finding,headache find
292     post dural puncture headache,headache find,rea...
356                      headache find,subjective vertigo
472     headache find,menopausal headache,idiopathic s...
608     post dural puncture headache,headache follow l...
779                      headache find,occipital headache
866     menopausal sleeplessness,menopausal,menopausal...
875     post dural puncture headache,headache follow l...
939                                   menopausal headache
1026    thunderclap headache,hemiplegia disorder,bruns...
1104               ophthalmoplegia disorder,headache find
1316                           pain finding,headache find
1338           post dural puncture headache,headache find
1594          pain finding,headache find,neuropathic pain
1667          

In [51]:
from collections import Counter

In [52]:
# Flatten the comma-separated symptom strings of the related rows into one
# list so that occurrences can be counted.
related_rows = temp.iloc[c].symptoms.values
final = ",".join(related_rows).split(',')

### I am removing all the symptoms that contain the word 'headache', as they are themselves types of headache and our goal is to find different symptoms related to headache
#### Symptoms like 'post dural puncture headache' will be removed

In [53]:
# Count the symptoms whose text does not contain the word 'headache'.
ll = Counter(x for x in final if 'headache' not in x)

In [54]:
ll  # display the per-symptom counts

Counter({'bruns nystagmus': 1,
         'hemiplegia disorder': 1,
         'menopausal': 1,
         'menopausal concentration lack': 1,
         'menopausal sleeplessness': 1,
         'neuropathic pain': 3,
         'ophthalmoplegia disorder': 1,
         'pain finding': 5,
         'reaction spinal lumbar puncture disorder': 3,
         'subjective vertigo': 1})

In [55]:
# Closeness of other symptoms to headache: each count divided by the number
# of distinct symptoms counted.
# NOTE(review): the divisor is the number of distinct symptoms, not the total
# number of occurrences or of related rows - confirm this is the intended
# normalisation.
n_distinct = len(ll)
{symptom: count / n_distinct for symptom, count in ll.items()}

{'bruns nystagmus': 0.1,
 'hemiplegia disorder': 0.1,
 'menopausal': 0.1,
 'menopausal concentration lack': 0.1,
 'menopausal sleeplessness': 0.1,
 'neuropathic pain': 0.3,
 'ophthalmoplegia disorder': 0.1,
 'pain finding': 0.5,
 'reaction spinal lumbar puncture disorder': 0.3,
 'subjective vertigo': 0.1}

### So, according to this dataset, 'pain' is the closest to headache, which seems quite plausible. We can improve the performance by using larger datasets

### We can do this for all other symptoms. 