In [1]:
from urllib.request import urlopen

data_set_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fncomms5212/MediaObjects/41467_2014_BFncomms5212_MOESM1045_ESM.txt'
data = urlopen(data_set_url)

In [2]:
my_data = []
for line in data:
    data_row = line.decode().rstrip()
    my_data.append([term for term in data_row.split('\t')])

In [3]:
import pandas as pd
df = pd.DataFrame.from_records(my_data[1:], columns = my_data[0])
df.columns

Index(['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence',
       'TFIDF score'],
      dtype='object')

In [4]:
df.columns = ['symptom','disease','n','score']
df['symptom'] = df['symptom'].astype('category')
df['disease'] = df['disease'].astype('category')
df['n'] = df['n'].astype('int')
df['score'] = df['score'].astype('float')
df.dtypes

symptom    category
disease    category
n             int64
score       float64
dtype: object

In [5]:
# sort data frame by disease term
df = df.sort_values(by=['disease', 'symptom', 'n', 'score'])

In [6]:
possible_diseases = set(df['disease'])
possible_symptoms = set(df['symptom'])
print('# Unique Diseases: {}'.format(len(possible_diseases)))
print('# Unique Symptoms: {}'.format(len(possible_symptoms)))

# Unique Diseases: 4219
# Unique Symptoms: 322


In [7]:
df.to_csv('symptom_disease_dataset.csv', index=False)

In [8]:
df['symptom_encod'] = df['symptom'].cat.codes
df['disease_encod'] = df['disease'].cat.codes

In [6]:
import pandas as pd
df = pd.read_csv('symptom_disease_dataset.csv')
df.head()

Unnamed: 0,symptom,disease,n,score
0,Language Development Disorders,22q11 Deletion Syndrome,1,2.486567
1,Mental Retardation,22q11 Deletion Syndrome,1,0.905447
2,Olfaction Disorders,22q11 Deletion Syndrome,1,2.28823
3,Respiratory Sounds,22q11 Deletion Syndrome,1,1.639269
4,Virilism,"46, XX Disorders of Sex Development",1,2.227056


In [18]:
n = len(df)
diseases = list(set(df['disease']))
diseases_csv = pd.DataFrame([['disease_id'] + diseases])
diseases_csv.to_csv('disease_ids.csv', index=False)

# Disease To Symptom Classifier

## Path To Completion

1. Reformat data-frame to have symp_1, ..., symp_n, disease_i for each row, with the values being the occurence of symptom j within 1 <= j <= n for disease i.

2. Create simulated training and testing data for each disease which match original data's original frequency distribution

3. Run random forests model on data

## Step 1 - Reformat Data-Frame

In [9]:
symptom_encod = set(df['symptom_encod'])
new_col_names = [""]*len(symptom_encod)

for i, symp_i in enumerate(symptom_encod):
    new_col_names[i] = "symp_" + str(symp_i)

new_col_names.append("disease")

In [10]:
disease_encod = set(df['disease_encod'])
new_df = pd.DataFrame([[0]*len(new_col_names) for _ in range(len(disease_encod))], columns=new_col_names)
new_df

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# test grabbing 'n'
df[(df.disease_encod == 1) & (df.symptom_encod == 312)][['n']].iloc[0]['n']

1

In [12]:
# iterate over diseases and add to columns
"""
Too slow!
num_cols = len(new_df.columns)
for i, disease in enumerate(set(df['disease_encod'])):
    new_row = [0]*num_cols
    new_row[-1] = disease
    
    # add all occurrences of symp_j which corresponds with disease i
    for j in range(num_cols-1):
        res = df[(df['disease_encod'] == disease) & (df['symptom_encod'] == j)]
        if len(res) > 0:
            new_row[j] = res[['n']].iloc[0]['n']
    
    # write over row in df
    new_df.iloc[i] = new_row
"""

"\nToo slow!\nnum_cols = len(new_df.columns)\nfor i, disease in enumerate(set(df['disease_encod'])):\n    new_row = [0]*num_cols\n    new_row[-1] = disease\n    \n    # add all occurrences of symp_j which corresponds with disease i\n    for j in range(num_cols-1):\n        res = df[(df['disease_encod'] == disease) & (df['symptom_encod'] == j)]\n        if len(res) > 0:\n            new_row[j] = res[['n']].iloc[0]['n']\n    \n    # write over row in df\n    new_df.iloc[i] = new_row\n"

In [13]:
len(new_df)

4219

In [14]:
num_cols = len(new_df.columns)
def edit_row(rnge):
    start, end = rnge
    for disease in range(start, end+1):
        new_row = [0]*num_cols
        new_row[-1] = disease
        
        for j in range(num_cols-1):
            res = df[(df['disease_encod'] == disease) & (df['symptom_encod'] == j)]
            if len(res) > 0:
                new_row[j] = res[['n']].iloc[0]['n']

        # write over row in df
        new_df.iloc[disease] = new_row

In [1]:
# uncomment for re-processing
import threading

def start_threads(threads):
    for thread in threads:
        thread.start()

def join_threads(threads):
    for thread in threads:
        thread.join()

def multi_thread(df, n_threads, proc):
    if n_threads < 1:
        return
    
    n_row = len(df)
    jump = n_row // n_threads
    
    threads = []
    for i in range(0, n_row, jump):
        # split range -> [start, end] --> feed to proc
        start, end = i, i+jump
        
        # extend for final thread
        if i // jump == n_threads:
            end = n_row-1
            
        t = threading.Thread(target=proc, args=((start,end),))
        threads.append(t)
    
    start_threads(threads)
    join_threads(threads)
"""
new_row_cnt = len(new_df)
divisor = 30
split_cnt = int(new_row_cnt/ divisor)

threads = []
for i in range(0,new_row_cnt,split_cnt):
    start, end = i, i+split_cnt
    if i // split_cnt == divisor:
        # extend for final thread
        end = new_row_cnt
    
    t = threading.Thread(target=edit_row, args=((start,end),))
    threads.append(t)
    
threads
"""

'\nnew_row_cnt = len(new_df)\ndivisor = 30\nsplit_cnt = int(new_row_cnt/ divisor)\n\nthreads = []\nfor i in range(0,new_row_cnt,split_cnt):\n    start, end = i, i+split_cnt\n    if i // split_cnt == divisor:\n        # extend for final thread\n        end = new_row_cnt\n    \n    t = threading.Thread(target=edit_row, args=((start,end),))\n    threads.append(t)\n    \nthreads\n'

In [17]:
# uncomment to run threads again (takes >= 1 hr)
"""
start_threads(threads)
join_threads(threads)
"""

'\nstart_threads(threads)\njoin_threads(threads)\n'

In [18]:
#new_df

In [19]:
# uncomment when processing again
#new_df.to_csv('symptom_disease_dataset_reformatted.csv', index=False)

In [5]:
import pandas as pd
new_df = pd.read_csv('symptom_disease_dataset_reformatted.csv')

# make columns floating point data types
for col in new_df.columns[:-1]:
    new_df[col] = new_df[col].astype('float')
    
new_df.head()

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4


In [9]:
from sklearn.model_selection import train_test_split
import numpy as np

x_cols = new_df.columns[:-1].tolist()
X = new_df[pd.Index(x_cols)]
y = new_df['disease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
clf = RandomForestClassifier(n_estimators=100)

#X_train.reshape(-1,1)
def model_predict(X_train, y_train, X_test):
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

def print_accuracy(y_test, y_pred):
    print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_pred)))


numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



## Goal: Artificially Augment the data
- 1. Identify distribution of data
- 2. Randomly create new data and append to dataframe

In [2]:
import plotly.graph_objects as go

def bar_plot(x, y):
    fig = go.Figure([go.Bar(x=x,y=y)])
    fig.show()

def disease_symptom_dist(disease_id):
    disease = new_df[new_df['disease'] == disease_id]
    y_vals = disease[disease.columns[:-1]].iloc[0].tolist()
    
    # create bar chart
    bar_plot(disease.columns[:-1], y_vals)

In [19]:
import numpy as np
def apply_noise(df, mu=0, sigma=0.1, round=True):
    n_rows, n_cols = len(df), len(df.columns)
    noise = np.random.normal(mu, sigma, [n_rows, n_cols])
    return df + noise

In [18]:
# not good enough
# capture mu and sigma for each symptom
def get_mu(col):
    return sum(col)/len(col) if len(col) else 0

def get_sigma(col, mu=0.1):
    n = len(col)
    return sum((col-mu)**2)/n

mu_x = sum(new_df['symp_9'])//len(new_df)
sig_x = (sum((new_df['symp_9']-mu_x)**2)/(len(new_df)))**0.5
print('avg: {}, std. dev: {}'.format(mu_x, sig_x))

avg: 0.0, std. dev: 0.026812462234127647


In [3]:
# overwrite new_df by taking row proportions
def row_proportion(row):
    row_sum = sum(row[:-1])
    for i, item in enumerate(row[:-1]):
        row[i] = item*1.0 / (row_sum if row_sum else 1)
    return row

In [7]:
#new_df = new_df.apply(lambda row: row_proportion(row))
def apply_row_proportion(bounds):
    start, end = bounds
    for i in range(start, end+1):
        new_df.iloc[i] = row_proportion(new_df.iloc[i])
    #new_df[start:end] = new_df[start:end].apply(lambda row : row_proportion(row))

In [10]:
import time

tik = time.clock()
multi_thread(new_df, 5, apply_row_proportion)
tok = time.clock()
print('Total time: {}'.format(tok-tik))

  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Total time: 108.406031


  """


In [13]:
new_df.head()

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,4.0


In [14]:
new_df.to_csv('symptom_disease_dataset_row_proportions.csv', index=False)

In [4]:
# import row proportions
import pandas as pd
new_df = pd.read_csv('symptom_disease_dataset_row_proportions.csv')

for col in new_df.columns[:-1]:
    new_df[col] = new_df[col].astype('float')
    
new_df.head()

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,4.0


## Row-proportions complete!

Now what?

Artificial data creation...

Goal:
- Add additional rows of data to data frame which are similar to each row in the df.

How to create similar rows of data?

For a given disease `i`, we have `N` symptoms `0 <= j < N`.

For symptoms which have positive proportion, we should add some slight noise to its value.
For symptoms which have zero proportion, we should only slightly add some noise to its value.

Naive:
- positive proportions take noise of `mu = 0`,`sigma = 0.1`.
- zero prortions take noise of `mu = 0`, `sigma=0.01`

Or,
Get `mu_i` and `sigma_i` which represents the average proportion and standard deviation of all symptoms pertaining to disease `i`. Scramble each value accordingly? Lost here.


Nope, new goal:

- Generate a random number for each symptom, and multiply it by the proportion of that symptom for a given disease.
- Do this for each symptom, recalculate the row proportion and add to data frame


Let's say for a given disease, we have proportions: `[0.1, 0.2, 0.7]`.

We randomly sample a number between 1-1000 for each symptom, we get: `[100, 250, 400]`

Now apply the proportions: `[10, 50, 280]`

Recalculate the row-proportions: `[.027, 0.205, .788]` This is our new row of data.

In [6]:
import random

def get_artificial_row(row):
    n = len(row)
    artificial_row = [0.0]*n
    
    # create new artificial counts
    for i, actual_proportion in enumerate(row[:-1]):
        artificial_row[i] = random.randint(0,1000) * actual_proportion
    
    # add disease to artificial row
    artificial_row[-1] = row[-1]
    
    return row_proportion(artificial_row)

In [20]:
# looking good, let's multithread and enhance!!

# create massive data frame, where each piece of data has 500 additional rows of artificial data
N_ADDITIONAL = 200

# stretch new_df
old_n_rows, old_n_cols = len(new_df), len(new_df.columns)
new_n_rows = old_n_rows + old_n_rows * N_ADDITIONAL
#big_df = pd.DataFrame([[0]*old_n_cols for _ in range(new_n_rows)], columns=new_df.columns)


def apply_artificial_data(bounds):
    start, end = bounds
    for i in range(start, end+1):
        # add 500 new rows of data
        for j in range(1, N_ADDITIONAL + 1):
            big_df.iloc[i+j] = get_artificial_row(new_df.iloc[i])


In [24]:
new_n_rows

848019

In [None]:
big_df_1 = pd.DataFrame([[0]*old_n_cols for _ in range(new_n_rows//3)], columns=new_df.columns)
print('done with big df 1')
big_df_2 = pd.DataFrame([[0]*old_n_cols for _ in range(new_n_rows//3)], columns=new_df.columns)
print('done with big df 2')
big_df_3 = pd.DataFrame([[0]*old_n_cols for _ in range(new_n_rows//3)], columns=new_df.columns)
print('done with big df 3')

done with big df 1
done with big df 2


In [34]:
big_df = big_df_1.append(big_df_2, ignore_index=True).append(big_df_3,ignore_index=True)
big_df.head()

TypeError: append() takes no keyword arguments

In [29]:
big_df = pd.DataFrame(big_df_lst, columns=new_df.columns)
big2 = big_df
res = big_df.append(big2, ignore_index=True).append(big2, ignore_index=True)

In [30]:
res

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_313,symp_314,symp_315,symp_316,symp_317,symp_318,symp_319,symp_320,symp_321,disease
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import pandas as pd
big_data = pd.read_csv('big_data.csv')
big_data.head()