## Mini-Model (To exercise integration in web app)

#### Author: Konner Macias

## Import Utility functions from Main Workbook

In [34]:
# Multi-threading
import threading

def start_threads(threads):
    for thread in threads:
        thread.start()

def join_threads(threads):
    for thread in threads:
        thread.join()

def multi_thread(n_rows, n_threads, proc):
    if n_threads < 1:
        return
    
    jump = n_rows // n_threads
    
    threads = []
    for i in range(0, n_rows, jump):
        # split range -> [start, end] --> feed to proc
        start, end = i, i+jump
        
        # extend for final thread
        if i // jump == n_threads:
            end = n_rows-1
            
        t = threading.Thread(target=proc, args=((start,end),))
        threads.append(t)
        
    start_threads(threads)
    join_threads(threads)


# Pruning
import heapq

# time-complexity: O(N*max(M, log(N))) 
def prune_df(df, top_num_diseases):
    h = []
    
    # time-complexity: O(N*max(M, log(N))) 
    # O(N) - rows
    for i in range(len(df)):
        # O(M) - cols
        row_sum = sum(df.iloc[i][:-1])
        # O(log(N))
        heapq.heappush(h, (-row_sum, i))
    
    # capture top diseases based on symptom frequency
    # time-complexity: O(N) * O(1) = O(N)
    top_diseases = [heapq.heappop(h)[1] for i in range(top_num_diseases)]
    
    # create new data frame of top diseases
    top_df = df[df.disease.isin(top_diseases)]
    for col in top_df.columns[:-1]:
        # if symptom has 0 occurrences, delete
        if not top_df[col].any():
            del top_df[col]
    
    return top_df


# Modeling
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


#X_train.reshape(-1,1)
def model_predict(clf, X_test):
    return clf.predict(X_test)

def print_accuracy(y_test, y_pred):
    print("Accuracy: {}".format(metrics.accuracy_score(y_test, y_pred)))

    
def build_model(df):
    # capturing predictors and predicted 
    x_cols = df.columns[:-1].tolist()
    X = df[pd.Index(x_cols)]
    y = df['disease']

    # performing 80/20 split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    
    # initializing clf
    clf = RandomForestClassifier(n_estimators=100, max_depth=30)
    
    # Fitting model
    clf.fit(X_train, y_train)
    return clf, X_test, y_test


# row-proportions
def row_proportion(row):
    row_sum = sum(row[:-1])
    for i, item in enumerate(row[:-1]):
        row[i] = item*1.0 / (row_sum if row_sum else 1)
    return row

## Import Compressed Data

In [6]:
import pandas as pd
new_df = pd.read_csv('symptom_disease_dataset_reformatted.csv')

# make columns floating point data types
for col in new_df.columns[:-1]:
    new_df[col] = new_df[col].astype('float')

## Prune Dataframe & Reapply Row Proportions
Only consider the top 5 diseases for practice

In [7]:
top_5_df = prune_df(new_df, 5)
top_5_df

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_311,symp_312,symp_313,symp_314,symp_315,symp_316,symp_317,symp_320,symp_321,disease
216,2.0,3.0,109.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11.0,0.0,2.0,0.0,0.0,0.0,5.0,0.0,4.0,216
2447,1.0,0.0,0.0,5.0,0.0,2.0,3.0,1.0,8.0,2.0,...,0.0,2.0,103.0,1.0,1.0,2.0,23.0,7.0,9.0,2447
2781,1.0,10.0,7.0,0.0,2.0,2.0,0.0,0.0,1.0,110.0,...,3.0,6.0,10.0,2.0,1.0,0.0,25.0,1486.0,4836.0,2781
2908,69.0,76.0,2.0,0.0,0.0,1.0,2.0,0.0,7.0,0.0,...,13.0,0.0,21.0,0.0,0.0,4.0,110.0,10.0,26.0,2908
3533,1.0,3.0,0.0,0.0,0.0,0.0,6.0,0.0,4.0,1.0,...,10.0,0.0,30.0,0.0,0.0,0.0,40.0,11.0,6.0,3533


In [8]:
def apply_row_proportion(bounds):
    start, end = bounds
    for i in range(start, end+1):
        top_5_df.iloc[i] = row_proportion(top_5_df.iloc[i])

multi_thread(len(top_5_df), 1, apply_row_proportion)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
Exception in thread Thread-5:
Traceback (most recent call last):
  File "/home/ko

In [9]:
top_5_df

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_311,symp_312,symp_313,symp_314,symp_315,symp_316,symp_317,symp_320,symp_321,disease
216,0.000114,0.000171,0.00621,0.000114,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000627,0.0,0.000114,0.0,0.0,0.0,0.000285,0.0,0.000228,216.0
2447,4.9e-05,0.0,0.0,0.000244,0.0,9.8e-05,0.000147,4.9e-05,0.000391,9.8e-05,...,0.0,9.8e-05,0.005031,4.9e-05,4.9e-05,9.8e-05,0.001123,0.000342,0.00044,2447.0
2781,1.4e-05,0.000143,0.0001,0.0,2.9e-05,2.9e-05,0.0,0.0,1.4e-05,0.00157,...,4.3e-05,8.6e-05,0.000143,2.9e-05,1.4e-05,0.0,0.000357,0.021203,0.069004,2781.0
2908,0.001171,0.00129,3.4e-05,0.0,0.0,1.7e-05,3.4e-05,0.0,0.000119,0.0,...,0.000221,0.0,0.000357,0.0,0.0,6.8e-05,0.001867,0.00017,0.000441,2908.0
3533,4.6e-05,0.000138,0.0,0.0,0.0,0.0,0.000275,0.0,0.000184,4.6e-05,...,0.000459,0.0,0.001377,0.0,0.0,0.0,0.001836,0.000505,0.000275,3533.0


## Add Artificial Data

In [16]:
import random

def get_artificial_row(row):
    n = len(row)
    artificial_row = [0.0]*n
    
    # create new artificial counts
    for i, actual_proportion in enumerate(row[:-1]):
        artificial_row[i] = random.randint(0,1000) * actual_proportion
    
    # add disease to artificial row
    artificial_row[-1] = row[-1]
    
    return row_proportion(artificial_row)



# create massive data frame, where each piece of data has 500 additional rows of artificial data
N_ADDITIONAL = 200

# stretch new_df
old_n_rows, old_n_cols = len(top_5_df), len(top_5_df.columns)
new_n_rows = old_n_rows + old_n_rows * N_ADDITIONAL

# create big_data sparse dataframe to populate
big_data = pd.DataFrame([[0]*old_n_cols for _ in range(new_n_rows)], columns=top_5_df.columns)

def apply_artificial_data(bounds):
    start, end = bounds
    i = start
    while i < end:
        og_index = i//(N_ADDITIONAL+1)
        # assign initial row
        big_data.iloc[i] = top_5_df.iloc[og_index]
        
        # add N_ADDITIONAL new rows of data
        j = 1
        while i+j < end and j < N_ADDITIONAL+1:
            big_data.iloc[i+j] = get_artificial_row(top_5_df.iloc[og_index])
            j += 1
        
        # move ahead N_ADDITIONAL
        i += N_ADDITIONAL+1

In [18]:
multi_thread(len(big_data), 1, apply_artificial_data)

## Train Model

In [35]:
clf, X_test, y_test = build_model(big_data)

In [36]:
print_accuracy(y_test, model_predict(clf, X_test))

Accuracy: 1.0


In [24]:
big_data

Unnamed: 0,symp_0,symp_1,symp_2,symp_3,symp_4,symp_5,symp_6,symp_7,symp_8,symp_9,...,symp_311,symp_312,symp_313,symp_314,symp_315,symp_316,symp_317,symp_320,symp_321,disease
0,0.000114,0.000171,0.006210,0.000114,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000627,0.0,0.000114,0.0,0.0,0.0,0.000285,0.000000,2.278813e-04,216.0
1,0.000576,0.000403,0.002926,0.000305,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.002146,0.0,0.000325,0.0,0.0,0.0,0.001208,0.000000,1.398213e-04,216.0
2,0.000018,0.000080,0.008735,0.000045,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000295,0.0,0.000094,0.0,0.0,0.0,0.000416,0.000000,7.156821e-05,216.0
3,0.000345,0.000575,0.015721,0.000016,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000518,0.0,0.000354,0.0,0.0,0.0,0.000119,0.000000,6.695909e-04,216.0
4,0.000010,0.000412,0.008187,0.000243,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.001080,0.0,0.000209,0.0,0.0,0.0,0.000252,0.000000,4.262804e-04,216.0
5,0.000099,0.000099,0.007278,0.000131,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000368,0.0,0.000104,0.0,0.0,0.0,0.000090,0.000000,1.284313e-05,216.0
6,0.000343,0.000360,0.018827,0.000058,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.001408,0.0,0.000038,0.0,0.0,0.0,0.000119,0.000000,7.922625e-04,216.0
7,0.000031,0.000096,0.002851,0.000061,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000082,0.0,0.000097,0.0,0.0,0.0,0.000291,0.000000,8.190183e-06,216.0
8,0.000016,0.000130,0.000119,0.000278,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.001445,0.0,0.000013,0.0,0.0,0.0,0.000826,0.000000,9.920758e-05,216.0
9,0.000055,0.000374,0.012021,0.000093,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.000669,0.0,0.000172,0.0,0.0,0.0,0.000310,0.000000,4.343617e-04,216.0


## Pickle Model

In [23]:
from joblib import dump, load
dump(clf, 'mini-model-08-31.joblib')

['mini-model-08-31.joblib']

## Example Usage in Web App

In [38]:
# 1. Import model (upon startup of the webapp)
from joblib import load
clf = load('mini-model-08-31.joblib')

# 2. Obtain form data
N_SYMPTOMS = 278
example = [False]*N_SYMPTOMS # 278 is the number of symptoms we consider (use 321 for the web app)
example[0] = True
example[100] = True # example symptom selections

# 3. Transform form data into row proportions
def row_prop(data):
    # assuming data is a list of booleans
    s = sum(data)
    for i, val in enumerate(data):
        data[i] = (val * 1.0) / s
    return data

example_prop = row_prop(example)

# 4. Feed model form proportions
pred = model_predict(clf, [example_prop])
pred

array([216.])

In [39]:
# 5. create disease_to_info [Copied from other notebook]
import pandas as pd
df = pd.read_csv('Disease_names.csv')

names = df['Disease_Name'].tolist()

webmd_url = "https://www.webmd.com/search/search_results/default.aspx?query="
disease_to_links = {}
for disease in names:
    disease_to_links[disease] = webmd_url + disease.replace(" ", "%20")

# final diseaseid to info list of tuples
diseaseid_to_info = []
for i, disease in enumerate(names):
    diseaseid_to_info.append((disease, disease_to_links[disease]))
diseaseid_to_info

[('Skull Fracture, Basilar',
  'https://www.webmd.com/search/search_results/default.aspx?query=Skull%20Fracture,%20Basilar'),
 ('Blind Loop Syndrome',
  'https://www.webmd.com/search/search_results/default.aspx?query=Blind%20Loop%20Syndrome'),
 ('Mastocytoma',
  'https://www.webmd.com/search/search_results/default.aspx?query=Mastocytoma'),
 ('Carcinoma, Adenoid Cystic',
  'https://www.webmd.com/search/search_results/default.aspx?query=Carcinoma,%20Adenoid%20Cystic'),
 ('Salivary Duct Calculi',
  'https://www.webmd.com/search/search_results/default.aspx?query=Salivary%20Duct%20Calculi'),
 ('Mycoplasmatales Infections',
  'https://www.webmd.com/search/search_results/default.aspx?query=Mycoplasmatales%20Infections'),
 ('Biliary Tract Neoplasms',
  'https://www.webmd.com/search/search_results/default.aspx?query=Biliary%20Tract%20Neoplasms'),
 ('Pallister-Hall Syndrome',
  'https://www.webmd.com/search/search_results/default.aspx?query=Pallister-Hall%20Syndrome'),
 ('Gardner Syndrome',
  'h

In [40]:
print('Your disease is: {}'.format(diseaseid_to_info[int(pred[0])][0]))

Your disease is: Paratuberculosis


## Send Data to Context

In [41]:
context = {}
results = {}

pred_id = int(pred[0])
pred_0 = {}
pred_0['disease_name'] = diseaseid_to_info[pred_id][0]
pred_0['disease_link'] = diseaseid_to_info[pred_id][1]
results['pred_0'] = pred_0
context['results'] = results
context

{'results': {'pred_0': {'disease_name': 'Paratuberculosis',
   'disease_link': 'https://www.webmd.com/search/search_results/default.aspx?query=Paratuberculosis'}}}