We'll start by loading the data.

In [20]:
import numpy as np
import pandas as pd
import sklearn

microbiology_events = pd.read_csv("../data/raw/mimic-iii-demo/MICROBIOLOGYEVENTS.csv")
admission = pd.read_csv("../data/raw/mimic-iii-demo/ADMISSIONS.csv")
patient = pd.read_csv("../data/raw/mimic-iii-demo/PATIENTS.csv")

In [21]:
demographics = pd.merge(admission, patient, \
                        on='subject_id')
demographics= pd.merge(demographics, microbiology_events[['org_name','hadm_id']],\
                       on ='hadm_id',\
                       how='left')

In [22]:
from mlworkflows import featuressimple



In [23]:
simple_summary = featuressimple.SimpleSummaries()

summaries = simple_summary.transform(demographics["org_name"])

In [24]:
from sklearn.pipeline import Pipeline

feat_pipeline = Pipeline([
    ('features',simple_summary)
])

from mlworkflows import util
util.serialize_to(feat_pipeline, "feature_pipeline.sav")

We don't want to our model to attempt to find patterns in the ID number, so we will drop it from the data set.

In [25]:
pt_info = pd.merge(admission, microbiology_events, on='hadm_id')
pt_columns = ['admission_type', 'admission_location', 'diagnosis', 'spec_itemid', 'org_itemid','interpretation']
pt_info = pt_info.loc[ : , pt_columns]

In order to gather insight from the data, we have to find out how to translate our information in a way that a machine will understand. We'll use something called a [One Hot Encoder](link) in order to create numeric columns per input.

In [18]:
from sklearn import preprocessing

demographics = demographics.dropna()
demographics = demographics.astype('str') 

# initialize one hot encoder
enc = preprocessing.OneHotEncoder(sparse='T', )

# fit transforn one hot encoder
encoded_demographics = pd.DataFrame(enc.fit_transform(demographics))

In [19]:
encoded_demographics

Unnamed: 0,0
0,"(0, 0)\t1.0\n (0, 7)\t1.0\n (0, 20)\t1.0\n..."
1,"(0, 1)\t1.0\n (0, 8)\t1.0\n (0, 17)\t1.0\n..."
2,"(0, 1)\t1.0\n (0, 8)\t1.0\n (0, 17)\t1.0\n..."
3,"(0, 1)\t1.0\n (0, 8)\t1.0\n (0, 17)\t1.0\n..."
4,"(0, 1)\t1.0\n (0, 8)\t1.0\n (0, 17)\t1.0\n..."
...,...
98,"(0, 5)\t1.0\n (0, 12)\t1.0\n (0, 16)\t1.0\..."
99,"(0, 5)\t1.0\n (0, 12)\t1.0\n (0, 16)\t1.0\..."
100,"(0, 5)\t1.0\n (0, 12)\t1.0\n (0, 16)\t1.0\..."
101,"(0, 5)\t1.0\n (0, 12)\t1.0\n (0, 16)\t1.0\..."


In [None]:
#more reasonable in a different notebook

#import sklearn.decomposition 
#from sklearn.decomposition import PCA

#pca = sklearn.decomposition.TruncatedSVD(2)

# fit_transform original data, put into data frame
#pca_demographics = pca.fit_transform(demographics)
#df_pca_demographics = pd.DataFrame(pca_demographics, columns=["x", "y"])

# transform new spam data, put into data frame
#pca_pt_info = pca.fit_transform(pt_info)
#df_pca_pt_info = pd.DataFrame(pca_pt_info, columns=["x", "y"])


split into training and test

In [15]:
from sklearn import model_selection
train, test = model_selection.train_test_split(encoded_demographics, random_state=43)

In [16]:
train = pd.DataFrame(train)
train

Unnamed: 0,0
0,"(0, 5)\t1.0\n (0, 12)\t1.0\n (0, 16)\t1.0\..."
1,"(0, 5)\t1.0\n (0, 12)\t1.0\n (0, 16)\t1.0\..."
2,"(0, 5)\t1.0\n (0, 12)\t1.0\n (0, 16)\t1.0\..."
3,"(0, 4)\t1.0\n (0, 11)\t1.0\n (0, 15)\t1.0\..."
4,"(0, 1)\t1.0\n (0, 8)\t1.0\n (0, 17)\t1.0\n..."
...,...
72,"(0, 3)\t1.0\n (0, 10)\t1.0\n (0, 19)\t1.0\..."
73,"(0, 2)\t1.0\n (0, 9)\t1.0\n (0, 14)\t1.0\n..."
74,"(0, 2)\t1.0\n (0, 9)\t1.0\n (0, 14)\t1.0\n..."
75,"(0, 3)\t1.0\n (0, 10)\t1.0\n (0, 19)\t1.0\..."


train

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

clf = sklearn.svm.SVC()
clf.fit(X=train, y=train["mrsa_positive"])
clf.fit(train, test)

In [None]:
predictions = clf.fit(test)

evaluate results

In [None]:
from mlworkflows import plot

df, chart = plot.binary_confusion_matrix(test["mrsa_positive"], predictions)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(test.label.values, predictions))

We want to save the model so that we can use it outside of this notebook.

In [None]:
model

In [None]:
from mlworkflows import util
util.serialize_to(model, "model.sav")