## Self-Learning Tutorial
### James Holbrook



Step 1) Import files

Run the code below, then select 'Choose Files' and select all provided files: patients.csv, observations.csv, conditions.csv.

In [25]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Source of data:
 # https://data.world/siyeh/synthetic-medical-data


observations_raw = pd.read_csv('observations.csv')
patients_raw = pd.read_csv('patients.csv')
conditions_raw = pd.read_csv('conditions.csv')


# Make a copy of each dataset so we do not have to re-import if we want to reverse a change in the data.

## Print a small sample of data

In [2]:
patients = patients_raw.copy()
patients.head()

Unnamed: 0,patient,birthdate,deathdate,ssn,drivers,passport,prefix,first,last,suffix,maiden,marital,race,ethnicity,gender,birthplace,address
0,4ee2c837-e60f-4c54-9fdf-8686bc70760b,1929-04-08,2029-11-11,999-78-5976,,,,Rosamaria,Pfannerstill,,,,black,dominican,F,Pittsfield MA US,18797 Karson Burgs Suite 444 Palmer Town MA 01...
1,efaf74f9-3de3-45dd-a5d5-26d08e8a3190,2016-12-15,2020-02-19,999-59-9186,,,,Loan,Bashirian,,,,white,american,F,Medford MA US,301 Eula Radial Suite 298 Brockton MA 02305 US
2,aaa4c718-2f48-4c13-9ad0-d287cf280824,1943-11-28,2017-10-22,999-43-3780,S99992928,FALSE,Mr.,Angelo,Buckridge,,,S,black,african,M,Framingham MA US,8693 Fred Crossroad New Bedford MA 02746 US
3,a1851c06-804e-4f31-9d8f-388cd52d4ad0,1954-10-22,2017-10-13,999-53-5542,S99975961,X98167138X,Mrs.,Cami,Terry,,Schuster,M,white,english,F,Hudson MA US,344 Olson Road Apt. 936 Attleboro MA 02703 US
4,48074b70-4db4-4ab0-b9e8-361bd2ba6216,1935-04-08,2017-09-06,999-34-8549,S99997003,X65866752X,Mr.,Giovanni,Russel,,,M,hispanic,puerto_rican,M,Westfield MA US,5780 Corwin Trafficway Dartmouth MA 02714 US


In [3]:
conditions = conditions_raw.copy()
conditions.head()


Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2009-01-08,2009-01-21,71949668-1c2e-43ae-ab0a-64654608defb,4d451e22-a354-40c9-8b33-b6126158666d,10509002,Acute bronchitis (disorder)
1,2010-10-16,2010-10-23,71949668-1c2e-43ae-ab0a-64654608defb,bed7ecff-b41c-422b-beac-ea00c8b02837,38822007,Cystitis
2,2013-02-07,2013-02-27,71949668-1c2e-43ae-ab0a-64654608defb,6f2e3935-b203-493e-a9c0-f23e847b9798,10509002,Acute bronchitis (disorder)
3,2013-10-19,2014-05-17,71949668-1c2e-43ae-ab0a-64654608defb,da4fd626-e74e-4930-91be-7fb3da7ea098,72892002,Normal pregnancy
4,2014-01-28,2014-02-10,71949668-1c2e-43ae-ab0a-64654608defb,b2e12445-b771-4738-944b-95cf6bbe76eb,195662009,Acute viral pharyngitis (disorder)


In [4]:
observations = observations_raw.copy()
observations.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS
0,2008-03-11,71949668-1c2e-43ae-ab0a-64654608defb,5114a5b4-64b8-47b2-82a6-0ce24aae0943,8302-2,Body Height,166.03,cm
1,2008-03-11,71949668-1c2e-43ae-ab0a-64654608defb,5114a5b4-64b8-47b2-82a6-0ce24aae0943,29463-7,Body Weight,54.42,kg
2,2008-03-11,71949668-1c2e-43ae-ab0a-64654608defb,5114a5b4-64b8-47b2-82a6-0ce24aae0943,39156-5,Body Mass Index,19.74,kg/m2
3,2008-03-11,71949668-1c2e-43ae-ab0a-64654608defb,5114a5b4-64b8-47b2-82a6-0ce24aae0943,8480-6,Systolic Blood Pressure,139.0,mmHg
4,2008-03-11,71949668-1c2e-43ae-ab0a-64654608defb,5114a5b4-64b8-47b2-82a6-0ce24aae0943,8462-4,Diastolic Blood Pressure,89.0,mmHg


# Data Observations and Cleaning


### Clean Observations

In [5]:
observations['DESCRIPTION'].value_counts()

DESCRIPTION
Body Height                                               8473
Systolic Blood Pressure                                   8473
Diastolic Blood Pressure                                  8473
Body Weight                                               8473
Body Mass Index                                           7296
                                                          ... 
Familial Alzheimer's disease of early onset (disorder)       3
Malignant tumor of colon                                     3
End stage renal disease (disorder)                           2
Secondary malignant neoplasm of colon                        1
Burn injury(morphologic abnormality)                         1
Name: count, Length: 73, dtype: int64

I am going to keep the observations types that are most common in order to have a more complete dataset for patients.

Then take the average of eacch measurement to get one patient per observation. This doesn't take into account the timing of each measurement, but should work fo this exercise.

In [6]:
observations_list = ['Body Height','Systolic Blood Pressure','Diastolic Blood Pressure','Body Mass Index']
observations = observations[observations['DESCRIPTION'].isin(observations_list)]
observations['value']= observations['VALUE'].astype(float)



In [7]:
patient_observation_avg = observations[['PATIENT','DESCRIPTION','UNITS','value']].groupby(['PATIENT','DESCRIPTION','UNITS'], as_index=False).mean()

patient_observation_avg.head()

Unnamed: 0,PATIENT,DESCRIPTION,UNITS,value
0,00341a88-1cc1-4b39-b0f9-05b0531991a0,Body Height,cm,151.11
1,00341a88-1cc1-4b39-b0f9-05b0531991a0,Body Mass Index,kg/m2,40.7225
2,00341a88-1cc1-4b39-b0f9-05b0531991a0,Diastolic Blood Pressure,mmHg,78.0
3,00341a88-1cc1-4b39-b0f9-05b0531991a0,Systolic Blood Pressure,mmHg,116.5
4,004a5922-7c4d-40cc-a0f8-68f607044c99,Body Height,cm,77.191


### Clean Conditions Data

In [8]:
conditions['DESCRIPTION'].value_counts().head(10)


DESCRIPTION
Viral sinusitis (disorder)              1125
Acute viral pharyngitis (disorder)       602
Acute bronchitis (disorder)              508
Prediabetes                              458
Hypertension                             373
Normal pregnancy                         339
Chronic sinusitis (disorder)             329
Otitis media                             202
Streptococcal sore throat (disorder)     146
Sprain of ankle                          108
Name: count, dtype: int64

There are many conditions that have similar diagnosis but have slightly different names. I am going to group them in to categories in order to have a cleaner dataset for predition.

The four groups are:

Respiratory

Heart

Diabetes

Physical

In [38]:
def group_conditions(x):
  if x in ['Viral sinusitis (disorder)','Acute viral pharyngitis (disorder)','Acute bronchitis (disorder)','Chronic sinusitis (disorder)']:
    return 'Respiratory'
  if x in ['Coronary Heart Disease','Cardiac Arrest','Stroke','Hypertension']:
    return 'heart'
  x = x.lower()
  if 'sinusitis' in x:
    return 'Respiratory'
  if 'throat' in x:
    return 'Respiratory'
  if 'diabetes' in x:
    return 'diabetes'
  if 'sprain' in x:
    return 'physical'
  if 'fracture' in x:
    return 'physical'
  if 'concussion' in x:
    return 'physical'
  else:
    return x




In [39]:
conditions['description_group'] = conditions['DESCRIPTION'].apply(group_conditions)
conditions_list = conditions['description_group'].value_counts().head(4).index.tolist()
conditions = conditions[conditions['description_group'].isin(conditions_list)]

conditions['description_group'].value_counts()



description_group
Respiratory    2687
diabetes        697
heart           635
physical        430
Name: count, dtype: int64

In [11]:

# Get most common condition for each patient in order to have one condition per patient.
patient_condition_mode = conditions[['PATIENT','description_group','CODE']].groupby(['PATIENT', 'description_group'], as_index=False).count().sort_values(by='CODE', ascending=False).groupby(['PATIENT'], as_index=False).head(1)[['PATIENT', 'description_group']]

patient_condition_mode.head()


Unnamed: 0,PATIENT,description_group
1826,bc6fbe62-116e-424f-943c-bae29fa9f319,Respiratory
60,063df653-8a3a-4f2d-b86d-3e82f20fc29a,Respiratory
2326,f37ed933-9ad2-45b4-969d-a782cf676a40,Respiratory
25,04090939-7263-4562-90c7-547d117113f2,Respiratory
2182,e5e6332c-285e-4931-9818-5381d83dde03,Respiratory


In [12]:
patient_features = patient_observation_avg.pivot(index='PATIENT', columns='DESCRIPTION', values='value').reset_index()
patient_features.head()

DESCRIPTION,PATIENT,Body Height,Body Mass Index,Diastolic Blood Pressure,Systolic Blood Pressure
0,00341a88-1cc1-4b39-b0f9-05b0531991a0,151.11,40.7225,78.0,116.5
1,004a5922-7c4d-40cc-a0f8-68f607044c99,77.191,15.16,80.2,120.8
2,00630ce3-e8eb-4ed4-889b-2c0ac257cbf4,136.904,15.485,73.1,120.5
3,007cbcc1-7333-4c79-b5e9-ffa93822fa11,153.473333,18.897778,79.111111,120.222222
4,00b7297c-9997-455d-b581-f1bd677f5012,78.929231,16.476,77.615385,114.692308


In [31]:
patients['bday_year'] = patients['birthdate'].apply(lambda x:x.split('-')[0])
patients['age_est'] = 2017 - patients['bday_year'].astype(int)

patients[['birthdate','bday_year','age_est']].head()

Unnamed: 0,birthdate,bday_year,age_est
0,1929-04-08,1929,88
1,2016-12-15,2016,1
2,1943-11-28,1943,74
3,1954-10-22,1954,63
4,1935-04-08,1935,82


In [14]:
patient_condition = patient_condition_mode.merge(patient_features, on='PATIENT')

df = patient_condition.merge(patients[['patient','age_est','race','ethnicity','gender']], left_on='PATIENT',right_on='patient')

features = ['Body Height','Body Mass Index','Diastolic Blood Pressure','Systolic Blood Pressure','age_est','race']

df[features].isna().sum()


Body Height                  0
Body Mass Index             10
Diastolic Blood Pressure     0
Systolic Blood Pressure      0
age_est                      0
race                         0
dtype: int64

In [32]:
df['Body Mass Index'] = df['Body Mass Index'].fillna(df['Body Mass Index'].mean())



In [41]:
## Setup Model
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import log_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


target = 'description_group'
X = pd.get_dummies(df[features])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X,y , random_state=22, test_size=0.25, shuffle=True)

y.value_counts()/len(y)


description_group
Respiratory    0.784703
physical       0.082153
heart          0.079320
diabetes       0.053824
Name: count, dtype: float64

## Feature Selection

description_group
Respiratory    0.784703
physical       0.082153
heart          0.079320
diabetes       0.053824
Name: count, dtype: float64

In [17]:
rf = RandomForestClassifier()
rfe = RFE(rf, n_features_to_select=4)

rfe.fit(X, y)
print(rfe.support_)
print(rfe.ranking_)

pd.DataFrame(zip(features, rfe.support_))



[ True  True  True  True False False False False False]
[1 1 1 1 2 6 5 4 3]


Unnamed: 0,0,1
0,Body Height,True
1,Body Mass Index,True
2,Diastolic Blood Pressure,True
3,Systolic Blood Pressure,True
4,age_est,False
5,race,False


In [18]:
model_features = ['Body Height','Body Mass Index','Diastolic Blood Pressure','Systolic Blood Pressure']

X = pd.get_dummies(df[model_features])
y = df[target]

rf = RandomForestClassifier(50)
rf.fit(X_train,y_train)

pred_probs = rf.predict_proba(X_test)
preds = rf.predict(X_test)


In [42]:
accuracy_score(preds,y_test)

0.7584905660377359

In [43]:
cv_pred = cross_val_predict(rf, X, y, cv=4)

accuracy_score(preds,y_test)


0.7584905660377359

## AutoMl Exercise

New Prediction: See if the patient has an elevated risk of a specific disease. Even if the prediction is no less accurate, if the probability % can be helpful then that is very useful to a Doctor.

In [23]:
df['is_respiratory'] = np.where(df['description_group']=='Respiratory',1,0)
df['is_diabetes'] = np.where(df['description_group']=='physical',1,0)
df['is_heart'] = np.where(df['description_group']=='heart',1,0)
df['is_physical'] = np.where(df['description_group']=='diabetes',1,0)


targets = ['is_respiratory', 'is_diabetes', 'is_heart', 'is_physical']
baseline_accuracies = []
baseline_losses = []
accuracy_scores = []
log_losses = []

for i in targets:
  cv_pred = cross_val_predict(rf, X, df[i], cv=4)
  cv_pred_proba = cross_val_predict(rf, X, df[i], cv=4, method='predict_proba')
  accuracy = accuracy_score(df[i], cv_pred)
  loss = log_loss(df[i], cv_pred_proba)
  if i =='is_respiratory':
    baseline_accuracy = df[i].sum()/len(df)
    baseline_loss = log_loss(df[i], [1 for x in range(len(df))])
  else:
    baseline_accuracy =  1-df[i].sum()/len(df)
    baseline_loss = log_loss(df[i], [0 for x in range(len(df))])


  baseline_accuracies.append(baseline_accuracy)
  baseline_losses.append(baseline_loss)
  accuracy_scores.append(accuracy)
  log_losses.append(loss)


eval_df = pd.DataFrame()

eval_df['targets'] = targets
eval_df['baseline_accuracy'] = baseline_accuracies
eval_df['baseline_loss'] = baseline_losses
eval_df['accuracy'] = accuracy_scores
eval_df['loss'] = log_losses

eval_df['accuracy_difference'] = eval_df['accuracy'] - eval_df['baseline_accuracy']
eval_df['loss_difference'] = eval_df['loss'] - eval_df['baseline_loss']

eval_df



Unnamed: 0,targets,baseline_accuracy,baseline_loss,accuracy,loss,accuracy_difference,loss_difference
0,is_respiratory,0.784703,7.760107,0.751653,0.690866,-0.03305,-7.06924
1,is_diabetes,0.917847,2.961093,0.915958,0.677719,-0.001889,-2.283375
2,is_heart,0.92068,2.858987,0.917847,0.466722,-0.002833,-2.392265
3,is_physical,0.946176,1.940027,0.945231,0.713964,-0.000944,-1.226062


This shows up that the accuracy is hard to beat from the baseline. However, using the predict_proba gives us a % chance of the disease. Comparing the loss shows us there is some information that could be useful. A heart condition prediction, for example, has a much lower log loss compared to the baseline of predicting the user has no heart condition.