In [8]:
import pandas as pd
import numpy as np
import re

In [9]:
import glob
result = pd.read_csv('../disease-symptom-cuis.csv', encoding='utf-8', index_col=None, header=0)
result.head()


Unnamed: 0,Disease,Symptom
0,C0162565,C0039239
1,C0162565,C0000737
2,C0162565,C0235704
3,C0162565,C0030554
4,C0162565,C0030552


In [10]:
len(result['Disease'].unique())

303

In [11]:
def isValid(cui):
    cui = str(cui)
    pattern = re.compile("C\\d{7}")
    if not pattern.match(cui):
        return False
    return True

def cuiToNumber(cui):
      return cui.strip("C").strip("0")

def convertCUI(cui):
    cui = str(cui)
    if not isValid(cui):
        return "C" + cui.zfill(7)
    else:
        return cui

def clean(the_string):
    return str(the_string.encode('utf-8'))

In [12]:
result['Disease'] = result['Disease'].apply(convertCUI)
result['Symptom'] = result['Symptom'].apply(convertCUI)
result.head()

Unnamed: 0,Disease,Symptom
0,C0162565,C0039239
1,C0162565,C0000737
2,C0162565,C0235704
3,C0162565,C0030554
4,C0162565,C0030552


In [13]:
result.to_csv("../disease-symptom-cuis.csv",index=False)

In [51]:
len(result['Disease'].unique())

303

In [53]:
df_foreign = pd.read_csv('../disease-symptom-foreign-source.csv', encoding='utf-8', index_col=None, header=0)
df_foreign.head()

Unnamed: 0,Disease,Symptom
0,C0020538,C0008031
1,C0020538,C0392680
2,C0020538,C0012833
3,C0020538,C0004093
4,C0020538,C0085639


In [54]:
len(df_foreign['Disease'].unique())

133

In [64]:
result = result.append(df_foreign)
result.tail()

Unnamed: 0,Disease,Symptom
1861,C0233472,C0425251
1862,C0233472,C0242453
1863,C0011127,C0232257
1864,C0011127,C0871754
1865,C0011127,C0015967


In [65]:
len(result['Disease'].unique())

398

In [66]:
result['Disease'] = result['Disease'].astype(str)
result['Symptom'] = result['Symptom'].astype(str)

In [57]:
import sqlite3
import csv, codecs, cStringIO

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f", 
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

conn = sqlite3.connect('../flaskapp/databases/umls.db')

c = conn.cursor()
c.execute('select * from descriptions')

writer = UnicodeWriter(open("descriptions.csv", "wb"))

writer.writerows(c)

In [58]:
umlsDF = pd.read_csv('./descriptions.csv', encoding='utf-8', index_col=None, header=0)

In [59]:
umlsDF.columns = ['CUI', 'LAT', 'SAB', 'TTY', 'STR', 'STY']
umlsDF.head()

Unnamed: 0,CUI,LAT,SAB,TTY,STR,STY
0,C0000005,ENG,MSH,ET,(131)I-MAA,T116|T121|T130
1,C0000039,ENG,MTH,PN,"1,2-dipalmitoylphosphatidylcholine",T109|T121
2,C0000039,ENG,MSH,MH,"1,2-Dipalmitoylphosphatidylcholine",T109|T121
3,C0000039,ENG,MSH,PM,"1,2 Dipalmitoylphosphatidylcholine",T109|T121
4,C0000039,ENG,MSH,ET,"1,2-Dihexadecyl-sn-Glycerophosphocholine",T109|T121


In [99]:
def findConcept(cui, lat):
    results = umlsDF.loc[(umlsDF['CUI']==cui) & (umlsDF['LAT']==lat)]["STR"].unique()
    
    if len(results) >= 1:
        return results[0]
    else:
        return cui

In [100]:
import io, json

labelsDir = "../flaskapp/app/frontend/data/"
languages = ["GER", "ENG"]
convertIso = {"GER": "de", "ENG": "en"}
sy_cuis = result["Symptom"].unique()

for lat in languages:
    currentLatOut = []
    
    for sy in sy_cuis:
        currentLatOut.append({"label": findConcept(sy, lat), "value": sy})
        
    with io.open(labelsDir + convertIso[lat] + '_Labels.js', 'w', encoding='utf-8') as f:
        f.write("exports." + convertIso[lat] + "LABELS = " + json.dumps(currentLatOut, ensure_ascii=False) + ";")


In [67]:
result.apply(lambda x: pd.lib.infer_dtype(x.values))

Disease    string
Symptom    string
dtype: object

In [68]:
result['Symptom'].replace('', np.nan, inplace=True)
result.dropna(subset=['Symptom'], inplace=True)

In [69]:
result['Disease'].replace('', np.nan, inplace=True)
result.dropna(subset=['Disease'], inplace=True)

In [70]:
df = pd.DataFrame(result)

In [71]:
df.columns

Index([u'Disease', u'Symptom'], dtype='object')

In [72]:
df_1 = pd.get_dummies(df.Symptom)

In [73]:
df_1.columns

Index([u'C0000727', u'C0000731', u'C0000737', u'C0000786', u'C0000809',
       u'C0001122', u'C0001127', u'C0001824', u'C0001883', u'C0001925',
       ...
       u'C3203358', u'C3203485', u'C3203595', u'C3494422', u'C3714552',
       u'C3714614', u'C3714745', u'C3887611', u'C4049320', u'C4082299'],
      dtype='object', length=1016)

In [74]:
df_1.head()

Unnamed: 0,C0000727,C0000731,C0000737,C0000786,C0000809,C0001122,C0001127,C0001824,C0001883,C0001925,...,C3203358,C3203485,C3203595,C3494422,C3714552,C3714614,C3714745,C3887611,C4049320,C4082299
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
df_s = df['Disease']

In [76]:
df_pivoted = pd.concat([df_s,df_1], axis=1)

In [77]:
df_pivoted.drop_duplicates(keep='first',inplace=True)

In [78]:
df_pivoted[:5]

Unnamed: 0,Disease,C0000727,C0000731,C0000737,C0000786,C0000809,C0001122,C0001127,C0001824,C0001883,...,C3203358,C3203485,C3203595,C3494422,C3714552,C3714614,C3714745,C3887611,C4049320,C4082299
0,C0162565,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C0162565,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C0162565,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C0162565,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C0162565,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
len(df_pivoted)

4024

In [80]:
cols = df_pivoted.columns

In [81]:
cols = cols[1:] # skip 'Disease'
cols

Index([u'C0000727', u'C0000731', u'C0000737', u'C0000786', u'C0000809',
       u'C0001122', u'C0001127', u'C0001824', u'C0001883', u'C0001925',
       ...
       u'C3203358', u'C3203485', u'C3203595', u'C3494422', u'C3714552',
       u'C3714614', u'C3714745', u'C3887611', u'C4049320', u'C4082299'],
      dtype='object', length=1016)

In [82]:
df_pivoted = df_pivoted.groupby('Disease').sum()
df_pivoted = df_pivoted.reset_index()
df_pivoted.head(8)

Unnamed: 0,Disease,C0000727,C0000731,C0000737,C0000786,C0000809,C0001122,C0001127,C0001824,C0001883,...,C3203358,C3203485,C3203595,C3494422,C3714552,C3714614,C3714745,C3887611,C4049320,C4082299
0,C0001175,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C0001206,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C0001339,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C0001418,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C0001511,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,C0001824,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,C0001973,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,C0002390,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
import os
all_files_ml = "../data/all-files-for-ml"

df_pivoted.to_csv(os.path.join(all_files_ml, "all_pivoted.csv"), index=False)

In [84]:
cols = df_pivoted.columns
cols = cols[1:] # skip 'title'
x = df_pivoted[cols] # symptom rows
y = df_pivoted['Disease'] # diseases
x.to_csv(os.path.join(all_files_ml, "all_x.csv"), index=False)

In [85]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split



In [86]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [87]:
mnb = MultinomialNB()
mnb = mnb.fit(x_train, y_train)

In [88]:
mnb.score(x_test, y_test)

0.0

In [89]:
mnb_tot = MultinomialNB()
mnb_tot = mnb_tot.fit(x, y)

In [90]:
mnb_tot.score(x, y)

0.992462311557789

In [91]:
disease_pred = mnb_tot.predict(x)

In [92]:
disease_real = y.values

In [93]:
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0} Actual:{1}'.format(disease_pred[i], disease_real[i]))

Pred: C0036262 Actual:C0041976
Pred: C0039621 Actual:C0549346
Pred: C0020428 Actual:C1384514


In [94]:
from sklearn.externals import joblib
joblib.dump(mnb, os.path.join(all_files_ml, 'all_mnb.pkl'), protocol=2)

['../data/all-files-for-ml/all_mnb.pkl']

In [95]:
data = pd.read_csv(os.path.join(all_files_ml, "all_x.csv"))

In [96]:
df = pd.DataFrame(data)
cols = df.columns
features = cols # = symptoms
features_raw = [str(features[x]) for x in range(len(features))]
features_raw = ','.join(map(str, features_raw))


# convert feature array into dict of symptom: index
feature_dict = {}
for i,f in enumerate(features):
    feature_dict[f] = i

print (feature_dict)  

{'C0017152': 164, 'C0423600': 756, 'C0037580': 391, 'C0497156': 811, 'C0013456': 117, 'C0856054': 908, 'C0007758': 61, 'C0235231': 629, 'C0037036': 387, 'C0033860': 356, 'C0032961': 348, 'C0947912': 932, 'C0011253': 89, 'C0332601': 722, 'C0027726': 299, 'C0231530': 564, 'C1405524': 970, 'C0423602': 757, 'C0234866': 622, 'C0232995': 590, 'C0026635': 278, 'C0024312': 261, 'C0002878': 18, 'C0234238': 614, 'C2939175': 1004, 'C0234233': 613, 'C0234544': 621, 'C0002871': 17, 'C0020639': 226, 'C0920063': 929, 'C0009402': 74, 'C0231835': 568, 'C0035423': 375, 'C0221358': 554, 'C0032739': 346, 'C3714745': 1012, 'C0392156': 738, 'C0036980': 386, 'C0858924': 917, 'C0702266': 873, 'C0034880': 366, 'C0080274': 448, 'C0036454': 380, 'C1384606': 967, 'C0003486': 26, 'C0340464': 727, 'C0155765': 515, 'C0002170': 11, 'C0009763': 79, 'C0159060': 523, 'C0042510': 433, 'C0221204': 545, 'C0022504': 241, 'C0032781': 347, 'C0017658': 168, 'C0009443': 76, 'C0017979': 170, 'C1313921': 963, 'C1868945': 992, 'C0

In [97]:
def findFeatures(disease):
    return result.loc[result['Disease'] == disease]["Symptom"].values.astype(str)

In [98]:
sample = np.zeros((len(features),), dtype=np.int)
sampe = sample.tolist()

search = ["C0857794", "C0149793", "C0000786"]
for i,s in enumerate(search):
    sample[feature_dict[s]] = 1

sample = np.array(sample).reshape(1,len(sample))

results = mnb.predict_proba(sample)[0]


# gets a dictionary of {'class_name': probability}
prob_per_class_dictionary = dict(zip(mnb.classes_, results))

# gets a list of ['most_probable_class', 'second_most_probable_class', ..., 'least_class']
results_ordered_by_probability = map(lambda x: {"disease": x[0],"prop": x[1] * 100, "sy": findFeatures(x[0])}, sorted(zip(mnb.classes_, results), key=lambda x: x[1], reverse=True))


print (list(results_ordered_by_probability))


#store the predicted probabilities for class 1
y_pred_prob = mnb.predict_proba(sample)[0]


[{'sy': array(['C0000786', 'C0857794', 'C0149793', 'C0456909', 'C0494562',
       'C0036572', 'C0149931', 'C0238457', 'C0033687', 'C0020538',
       'C0040034', 'C0019054', 'C0002871', 'C0019080', 'C0034734',
       'C0085642', 'C0027051', 'C0013922', 'C0038454', 'C0000809'],
      dtype='|S8'), 'disease': 'C0085278', 'prop': 2.8497636286476187}, {'sy': array(['C0019054'], dtype='|S8'), 'disease': 'C0017551', 'prop': 0.3765609316280877}, {'sy': array(['C0236171'], dtype='|S8'), 'disease': 'C0017920', 'prop': 0.3765609316280877}, {'sy': array(['C0033774'], dtype='|S8'), 'disease': 'C0036262', 'prop': 0.3765609316280877}, {'sy': array(['C0401151'], dtype='|S8'), 'disease': 'C0221036', 'prop': 0.3765609316280877}, {'sy': array(['C0013384'], dtype='|S8'), 'disease': 'C0392549', 'prop': 0.3765609316280877}, {'sy': array(['C0038002'], dtype='|S8'), 'disease': 'C0948968', 'prop': 0.3765609316280877}, {'sy': array(['C0010200', 'C0024110'], dtype='|S8'), 'disease': 'C0006267', 'prop': 0.3754523