In [89]:
from urllib.request import urlopen

# Supplementary text file for the article with DOI 10.1038/ncomms5212
# (the DOI is URL-encoded in the path below).
data_set_url = 'https://static-content.springer.com/esm/art%3A10.1038%2Fncomms5212/MediaObjects/41467_2014_BFncomms5212_MOESM1045_ESM.txt'
# The returned response object is iterated line by line in the next cell,
# so it is intentionally left open here.
data = urlopen(data_set_url)

In [90]:
# Decode each raw line, drop trailing whitespace and split it on tabs.
# The first entry of my_data is the header row.
my_data = [raw_line.decode().rstrip().split('\t') for raw_line in data]

In [113]:
import pandas as pd
# The first parsed row holds the column names, the remaining rows are the records.
header_row, data_rows = my_data[0], my_data[1:]
df = pd.DataFrame.from_records(data_rows, columns=header_row)
df.columns

Index(['MeSH Symptom Term', 'MeSH Disease Term', 'PubMed occurrence',
       'TFIDF score'],
      dtype='object')

In [114]:
# Short column names, then one dtype per column (every value arrives as a string).
df.columns = ['symptom', 'disease', 'n', 'score']
column_dtypes = {
    'symptom': 'category',
    'disease': 'category',
    'n': 'int',
    'score': 'float',
}
df = df.astype(column_dtypes).sort_values(by=['disease', 'symptom', 'n', 'score'])

# Distinct labels on each side of the symptom/disease relation.
possible_diseases = set(df['disease'])
possible_symptoms = set(df['symptom'])
print('# Unique Diseases: {}'.format(len(possible_diseases)))
print('# Unique Symptoms: {}'.format(len(possible_symptoms)))

# Unique Diseases: 4219
# Unique Symptoms: 322


In [108]:
# df.to_csv('symptom_disease_dataset.csv', index=False)

In [115]:
# Integer code of each categorical label, stored next to the label itself.
for label_col in ('symptom', 'disease'):
    df[label_col + '_encod'] = df[label_col].cat.codes

In [110]:
import pandas as pd
# Reload the dataset from a local CSV; this rebinds `df` to the file's contents.
# NOTE(review): the preview shows an "Unnamed: 0" column, so the file on disk
# was presumably written with its index - confirm before relying on column positions.
df = pd.read_csv('symptom_disease_dataset.csv')
df.head()

Unnamed: 0,symptom,disease,n,score
0,Language Development Disorders,22q11 Deletion Syndrome,1,2.486567
1,Mental Retardation,22q11 Deletion Syndrome,1,0.905447
2,Olfaction Disorders,22q11 Deletion Syndrome,1,2.28823
3,Respiratory Sounds,22q11 Deletion Syndrome,1,1.639269
4,Virilism,"46, XX Disorders of Sex Development",1,2.227056


In [104]:
n = len(df)
# A set of strings has no stable iteration order across kernel restarts
# (string hashing is randomised), so sort to make the id file reproducible.
# key=str keeps the sort working even if a non-string value is present.
diseases = sorted(set(df['disease']), key=str)
# Single-row frame: the literal 'disease_id' followed by every disease name.
diseases_csv = pd.DataFrame([['disease_id'] + diseases])
diseases_csv.to_csv('disease_ids.csv', index=False)

# Disease To Symptom Classifier

## Path To Completion

1. Reformat the data-frame so that each row has the columns symp_1, ..., symp_n, disease_i, where the value in column symp_j (1 <= j <= n) is the occurrence count of symptom j for disease i.

2. Create simulated training and testing data for each disease that matches the original data's frequency distribution

3. Run random forests model on data

## Step 1 - Reformat Data-Frame

In [116]:
symptom_encod = set(df['symptom_encod'])

# Build the columns in ascending code order explicitly instead of relying on
# set iteration order: edit_row writes the count of symptom code j at position j.
new_col_names = ["symp_" + str(symp_i) for symp_i in sorted(symptom_encod)]

# The label column goes last.
new_col_names.append("disease")

In [117]:
disease_encod = set(df['disease_encod'])
# One all-zero row per distinct disease code, with the symp_* and disease columns.
new_df = pd.DataFrame(0, index=range(len(disease_encod)), columns=new_col_names)

In [11]:
# test grabbing 'n' for one (disease code, symptom code) pair
pair_mask = (df['disease_encod'] == 1) & (df['symptom_encod'] == 312)
df.loc[pair_mask, 'n'].iloc[0]

1

In [14]:
num_cols = len(new_df.columns)
def edit_row(rnge):
    """Fill rows ``start..end`` (both inclusive) of ``new_df`` with symptom counts.

    For every disease code in the range a row is built whose position ``j``
    holds the ``n`` value of the first ``df`` record matching that disease and
    symptom code ``j`` (0 when there is none); the last position holds the
    disease code. The row then overwrites ``new_df.iloc[disease]``.

    Parameters
    ----------
    rnge : tuple
        ``(start, end)`` disease codes, both processed.
    """
    start, end = rnge
    n_symptoms = num_cols - 1
    for disease in range(start, end+1):
        new_row = [0]*num_cols
        new_row[-1] = disease

        # Filter df once per disease rather than once per (disease, symptom)
        # pair; the resulting row is the same, with far fewer full scans.
        matches = df[df['disease_encod'] == disease]
        filled = set()
        for symptom, count in zip(matches['symptom_encod'], matches['n']):
            # Keep only the first record per symptom and skip codes that
            # have no symp_* position in the row.
            if 0 <= symptom < n_symptoms and symptom not in filled:
                new_row[symptom] = count
                filled.add(symptom)

        # write over row in df
        new_df.iloc[disease] = new_row

In [1]:
# uncomment for re-processing
import threading

def start_threads(threads):
    """Call ``start()`` on every element of ``threads``, in order."""
    for worker in threads:
        worker.start()

def join_threads(threads):
    """Call ``join()`` on every element of ``threads``, in order."""
    for worker in threads:
        worker.join()

def multi_thread(n_rows, n_threads, proc):
    """Run ``proc`` over chunks of ``range(n_rows)``, one thread per chunk.

    Chunks start at ``0, jump, 2*jump, ...`` with
    ``jump = max(1, n_rows // n_threads)`` and each thread is called as
    ``proc((start, end))`` with ``end = start + jump``. The chunk whose
    position equals ``n_threads`` (the first one past an even split) gets
    ``end = n_rows - 1`` instead. All threads are started, then joined.

    NOTE(review): consecutive chunks share their boundary index, and the last
    chunk can have ``end == n_rows``. Whether that is in range depends on
    whether ``proc`` treats ``end`` as inclusive or exclusive - confirm per caller.

    Parameters
    ----------
    n_rows : int
        Number of rows to cover. Nothing is done when it is below 1.
    n_threads : int
        Requested number of chunks. Nothing is done when it is below 1.
    proc : callable
        Called with a single ``(start, end)`` tuple.
    """
    if n_threads < 1 or n_rows < 1:
        return

    # With more threads than rows the floor division is 0 and range() rejects
    # a zero step; fall back to one row per chunk.
    jump = max(1, n_rows // n_threads)

    threads = []
    for i in range(0, n_rows, jump):
        # split range -> [start, end] --> feed to proc
        start, end = i, i+jump

        # extend for final thread
        if i // jump == n_threads:
            end = n_rows-1

        t = threading.Thread(target=proc, args=((start,end),))
        threads.append(t)

    start_threads(threads)
    join_threads(threads)

In [2]:
### Creates the re-formatted data-frame!
# multi_thread(len(new_df), 4, edit_row)

In [3]:
# uncomment when processing again
# new_df.to_csv('symptom_disease_dataset_reformatted.csv', index=False)

In [4]:
import pandas as pd
new_df = pd.read_csv('symptom_disease_dataset_reformatted.csv')

# Cast every column except the trailing one to floating point.
feature_cols = new_df.columns[:-1]
new_df = new_df.astype({name: 'float' for name in feature_cols})

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


#X_train.reshape(-1,1)
def model_predict(clf, X_test):
    """Return ``clf.predict(X_test)``."""
    predictions = clf.predict(X_test)
    return predictions

def print_accuracy(y_test, y_pred):
    """Print the accuracy score of ``y_pred`` measured against ``y_test``."""
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy: {}".format(accuracy))

    
def build_model(df, test_size=0.2, random_state=None):
    """Fit a random forest that predicts the ``disease`` column from all others.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``disease`` column; every other column is a predictor.
    test_size : float, default 0.2
        Share of the rows held out for testing (80/20 split by default).
    random_state : int or None, default None
        Seed passed to both the split and the forest. ``None`` keeps the
        previous unseeded behaviour; pass an int for a reproducible model.

    Returns
    -------
    tuple
        ``(clf, X_test, y_test)``: the fitted classifier and the held-out data.
    """
    # Select predictors by name instead of by position, so the label cannot
    # end up inside X when 'disease' is not the last column.
    X = df.drop(columns=['disease'])
    y = df['disease']

    # performing the train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # initializing clf
    clf = RandomForestClassifier(n_estimators=100, max_depth=30,
                                 random_state=random_state)

    # Fitting model
    clf.fit(X_train, y_train)
    return clf, X_test, y_test
    

## Goal: Artificially Augment the data
- 1. Identify distribution of data
- 2. Randomly create new data and append to dataframe

In [None]:
import plotly.graph_objects as go

def bar_plot(x, y):
    """Show a plotly bar chart with ``x`` as categories and ``y`` as bar heights."""
    bars = go.Bar(x=x, y=y)
    go.Figure([bars]).show()

def disease_symptom_dist(disease_id):
    """Plot the symptom values of the first ``new_df`` row whose disease is ``disease_id``."""
    # every column except the trailing one is plotted
    symptom_cols = new_df.columns[:-1]
    matching = new_df.loc[new_df['disease'] == disease_id, symptom_cols]

    # create bar chart from the first matching row
    bar_plot(symptom_cols, matching.iloc[0].tolist())

In [None]:
import numpy as np
def apply_noise(df, mu=0, sigma=0.1, round=True):
    """Return ``df`` plus element-wise Gaussian noise.

    The noise is drawn from ``np.random.normal(mu, sigma)`` with one sample
    per cell of ``df``. The ``round`` argument is accepted but not used.
    """
    shape = (len(df), len(df.columns))
    return df + np.random.normal(loc=mu, scale=sigma, size=shape)

In [None]:
# not good enough
# capture mu and sigma for each symptom
def get_mu(col):
    """Return the arithmetic mean of ``col``, or 0 when ``col`` is empty."""
    if not len(col):
        return 0
    return sum(col) / len(col)

def get_sigma(col, mu=0.1):
    """Return the standard deviation of ``col`` around ``mu``.

    Computed as the square root of the mean squared deviation from ``mu``.
    The square root was previously missing, so the function returned the
    variance.

    Parameters
    ----------
    col : array-like supporting ``col - mu`` (e.g. numpy array or pandas Series)
    mu : float, default 0.1
        Centre the deviations are measured from.

    Returns
    -------
    float
        0 when ``col`` is empty (same convention as ``get_mu``).
    """
    n = len(col)
    if not n:
        # avoid dividing by zero on an empty column
        return 0
    variance = sum((col-mu)**2)/n
    return variance ** 0.5

In [None]:
# overwrite new_df by taking row proportions
def row_proportion(row):
    """Divide every element of ``row`` except the last by their sum, in place.

    The last element (the disease label) is left untouched. When the sum is 0
    the divisor falls back to 1, so an all-zero row stays all-zero.

    Parameters
    ----------
    row : list or pandas.Series
        Feature values followed by one trailing label.

    Returns
    -------
    The same ``row`` object, modified in place.
    """
    # A Series with string labels must be addressed by position explicitly:
    # plain integer keys on such a Series are a deprecated positional fallback.
    is_series = hasattr(row, 'iloc')
    features = list(row.iloc[:-1] if is_series else row[:-1])

    row_sum = sum(features)
    divisor = row_sum if row_sum else 1

    target = row.iloc if is_series else row
    for i, item in enumerate(features):
        target[i] = item*1.0 / divisor
    return row

In [None]:
def apply_row_proportion(bounds):
    """Replace rows ``start..end`` (both inclusive) of ``new_df`` by their row proportions."""
    first, last = bounds
    for row_idx in range(first, last + 1):
        current = new_df.iloc[row_idx]
        new_df.iloc[row_idx] = row_proportion(current)

In [None]:
import time

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
tik = time.perf_counter()
### Applies row proportions
# multi_thread(len(new_df), 5, apply_row_proportion)
tok = time.perf_counter()
print('Total time: {}'.format(tok-tik))

In [None]:
# new_df.to_csv('symptom_disease_dataset_row_proportions.csv', index=False)

In [None]:
# import row proportions
#
# EXECUTE FROM THIS CELL DOWN FOR ARTIFICIAL DATA CREATION
#
#
import pandas as pd
new_df = pd.read_csv('symptom_disease_dataset_row_proportions.csv')

# Cast every column except the trailing one to floating point.
proportion_cols = new_df.columns[:-1]
new_df = new_df.astype({name: 'float' for name in proportion_cols})

new_df.head()

## Row-proportions complete!

Now what?

Artificial data creation...

Let's say for a given disease, we have proportions: `[0.1, 0.2, 0.7]`.

We randomly sample an integer between 0 and 1000 for each symptom; say we get: `[100, 250, 400]`

Now apply the proportions: `[10, 50, 280]`

Recalculate the row-proportions (each value divided by the row sum, 340): `[0.029, 0.147, 0.824]`. This is our new row of data.

In [None]:
import random

def get_artificial_row(row):
    """Build one synthetic row from a row of proportions.

    Every value except the last is multiplied by an independent random integer
    drawn from ``random.randint(0, 1000)``; the last value (the disease label)
    is copied unchanged. The result is passed through ``row_proportion``.

    Parameters
    ----------
    row : list or pandas.Series
        Proportions followed by one trailing label. It is not modified.

    Returns
    -------
    list
        The value returned by ``row_proportion`` for the new row.
    """
    # A label-indexed Series must be sliced by position explicitly: integer
    # keys such as row[-1] are a deprecated positional fallback there.
    is_series = hasattr(row, 'iloc')
    proportions = row.iloc[:-1] if is_series else row[:-1]
    label = row.iloc[-1] if is_series else row[-1]

    # create new artificial counts
    artificial_row = [random.randint(0,1000) * actual_proportion
                      for actual_proportion in proportions]

    # add disease to artificial row
    artificial_row.append(label)

    return row_proportion(artificial_row)

In [None]:
# looking good, let's multithread and enhance!!

# create massive data frame, where each original row is followed by N_ADDITIONAL rows of artificial data
N_ADDITIONAL = 200

# stretch new_df
old_n_rows, old_n_cols = len(new_df), len(new_df.columns)
# every original row plus N_ADDITIONAL artificial rows for each of them
new_n_rows = old_n_rows + old_n_rows * N_ADDITIONAL


def apply_artificial_data(bounds):
    """Fill ``big_data`` rows ``start`` up to (not including) ``end``.

    Rows are written in groups of ``N_ADDITIONAL + 1``: the first row of a
    group is a copy of ``new_df.iloc[group_start // (N_ADDITIONAL + 1)]`` and
    the rest are artificial rows generated from that same source row. A group
    that would run past ``end`` is cut short. The bounds and each source
    index are printed as progress output.
    """
    print(bounds)
    start, end = bounds
    group_size = N_ADDITIONAL+1

    for group_start in range(start, end, group_size):
        og_index = group_start // group_size
        print(og_index)

        # first slot of the group: the source row itself
        big_data.iloc[group_start] = new_df.iloc[og_index]

        # remaining slots: artificial rows, never reaching `end`
        offsets_stop = min(group_size, end - group_start)
        for offset in range(1, offsets_stop):
            big_data.iloc[group_start + offset] = get_artificial_row(new_df.iloc[og_index])

In [None]:
#test = pd.DataFrame([[0]*len(new_df.columns) for _ in range(10)], columns=new_df.columns)
#multi_thread(len(test), 1, apply_artificial_data)

In [3]:
def print_rows(df, bounds):
    """Print each row of ``df`` from position ``start`` up to (not including) ``end``."""
    first, stop = bounds
    for row in (df.iloc[position] for position in range(first, stop)):
        print_row(row)

def print_row(row):
    """Print every value of ``row`` on its own line."""
    for value in row:
        print(value)

In [4]:
# create 10 big data csvs
# The code below is disabled by being wrapped in a string literal: nothing in
# it runs, the cell only echoes the string.
# NOTE(review): as written, big_data is a list of lists here, yet it is sliced
# with .iloc - presumably it was meant to be a DataFrame; confirm before re-enabling.
"""
big_data = [[0]*old_n_cols for _ in range(new_n_rows)]
i, jump = 0, len(big_data)//10

big_data_1 = big_data.iloc[i:i+jump]
i += jump
big_data_2 = big_data.iloc[i:i+jump]
i += jump
big_data_3 = big_data.iloc[i:i+jump]
i += jump
big_data_4 = big_data.iloc[i:i+jump]
i += jump
big_data_5 = big_data.iloc[i:i+jump]
i += jump
big_data_6 = big_data.iloc[i:i+jump]
i += jump
big_data_7 = big_data.iloc[i:i+jump]
i += jump
big_data_8 = big_data.iloc[i:i+jump]
i += jump
big_data_9 = big_data.iloc[i:i+jump]
i += jump
big_data_10 = big_data.iloc[i:]
i += jump
big_data_1.to_csv('big_data_1.csv')
big_data_2.to_csv('big_data_2.csv')
big_data_3.to_csv('big_data_3.csv')
big_data_4.to_csv('big_data_4.csv')
big_data_5.to_csv('big_data_5.csv')
big_data_6.to_csv('big_data_6.csv')
big_data_7.to_csv('big_data_7.csv')
big_data_8.to_csv('big_data_8.csv')
big_data_9.to_csv('big_data_9.csv')
big_data_10.to_csv('big_data_10.csv')
"""

"\nbig_data = [[0]*old_n_cols for _ in range(new_n_rows)]\ni, jump = 0, len(big_data)//10\n\nbig_data_1 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_2 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_3 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_4 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_5 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_6 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_7 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_8 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_9 = big_data.iloc[i:i+jump]\ni += jump\nbig_data_10 = big_data.iloc[i:]\ni += jump\nbig_data_1.to_csv('big_data_1.csv')\nbig_data_2.to_csv('big_data_2.csv')\nbig_data_3.to_csv('big_data_3.csv')\nbig_data_4.to_csv('big_data_4.csv')\nbig_data_5.to_csv('big_data_5.csv')\nbig_data_6.to_csv('big_data_6.csv')\nbig_data_7.to_csv('big_data_7.csv')\nbig_data_8.to_csv('big_data_8.csv')\nbig_data_9.to_csv('big_data_9.csv')\nbig_data_10.to_csv('big_data_10.csv')\n"

In [5]:
# Disabled loader for the ten big_data_*.csv files: the code is wrapped in a
# string literal, so nothing in it runs.
'''import pandas as pd
big_data_1 = pd.read_csv('big_data_1.csv')
big_data_2 = pd.read_csv('big_data_2.csv')
big_data_3 = pd.read_csv('big_data_3.csv')
big_data_4 = pd.read_csv('big_data_4.csv')
big_data_5 = pd.read_csv('big_data_5.csv')
big_data_6 = pd.read_csv('big_data_6.csv')
big_data_7 = pd.read_csv('big_data_7.csv')
big_data_8 = pd.read_csv('big_data_8.csv')
big_data_9 = pd.read_csv('big_data_9.csv')
big_data_10 = pd.read_csv('big_data_10.csv')

big_data = big_data_1.append(big_data_2, ignore_index=True).append(big_data_3, ignore_index=True).append(big_data_4, ignore_index=True).append(big_data_5, ignore_index=True).append(big_data_6, ignore_index=True).append(big_data_7, ignore_index=True).append(big_data_8, ignore_index=True).append(big_data_9, ignore_index=True).append(big_data_10, ignore_index=True)
del big_data['Unnamed: 0']'''

In [6]:
### Run this for artificial data creation!
# multi_thread(len(big_data), 1, apply_artificial_data)

NameError: name 'multi_thread' is not defined

In [8]:
import pandas as pd
# Number of art_data_<k>.csv parts to load (k = 1..N_ART_FILES).
# NOTE(review): the cell that writes these files uses NUM_FILES = 20, while
# only the first 10 are read here - confirm which count is intended.
N_ART_FILES = 10

art_frames = [
    pd.read_csv('art_data_{}.csv'.format(part_no))
    for part_no in range(1, N_ART_FILES + 1)
]

# DataFrame.append was removed in pandas 2.0; a single concat gives the same
# stacked frame with a fresh 0..n-1 index.
big_data = pd.concat(art_frames, ignore_index=True)

# Drop the index column that to_csv stored alongside the data.
del big_data['Unnamed: 0']

In [12]:
# TODO: execute after artificial data finishes!
# Split big_data into NUM_FILES consecutive parts and write each one to
# art_data_<k>.csv (k = 1..NUM_FILES). The number of parts now follows
# NUM_FILES instead of twenty hand-written slices.
NUM_FILES = 20
jump = len(big_data)//NUM_FILES

for part_no in range(1, NUM_FILES + 1):
    part_start = (part_no - 1) * jump
    # The last part takes every remaining row, so no rows are lost when
    # len(big_data) is not a multiple of NUM_FILES.
    part_stop = part_start + jump if part_no < NUM_FILES else None
    # The index is written on purpose: the loading cell deletes the resulting
    # 'Unnamed: 0' column.
    big_data.iloc[part_start:part_stop].to_csv('art_data_{}.csv'.format(part_no))


In [14]:
# Train on the combined big_data frame; keep the held-out split for scoring.
clf, X_test, y_test = build_model(big_data)

In [15]:
# Score the fitted model on the held-out rows returned by build_model.
print_accuracy(y_test, model_predict(clf, X_test))

Accuracy: 0.7032617155255773


In [16]:
from joblib import dump, load
# Persist the fitted classifier to disk with joblib.
dump(clf, 'model-08-30.joblib')

['model-08-30.joblib']