Python Port of PheCAP Algorithm

This notebook replicates the process in `main.R` using the Python port (`pyphecap`).

In [1]:
import sys

# Make the sibling source directory importable (presumably where the local
# `pyphecap` package lives — confirm). The membership check keeps this cell
# idempotent: re-running it does not append a duplicate entry to sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

In [2]:
import pandas as pd

from pyphecap.training import train_phenotyping_model
from pyphecap.feature_extraction import run_feature_extraction
from pyphecap.phecap_data import add_validation_column, Data
from pyphecap.surrogate import Surrogates, Surrogate

In [3]:
# Load the sample EHR data (a CSV that was output from R).
ehr_data = pd.read_csv(filepath_or_buffer='../data/ehr_data.csv')
ehr_data.shape  # displayed as (rows, columns)

(10000, 590)

In [4]:
# Gold standard: class counts over the rows whose `label` is not missing.
ehr_data.loc[ehr_data['label'].notna(), 'label'].value_counts()

1.0    119
0.0     62
Name: label, dtype: int64

In [5]:
# Prepare the modelling input from the raw EHR frame.
# The return value of add_validation_column is not used, so it presumably
# adds the 'validation' column to ehr_data in place — TODO confirm.
add_validation_column(ehr_data)
data = Data(
    ehr_data,
    'healthcare_utilization',  # presumably the healthcare-utilization column name — confirm against Data's signature
    'label',  # column holding the gold-standard labels counted above
    'validation',  # presumably the column written by add_validation_column — confirm
)

In [6]:
# Surrogate definitions passed to feature extraction and model training below.
surrogates = Surrogates(
    Surrogate("main_ICD"),  # single-variable surrogate
    Surrogate("main_NLP"),  # single-variable surrogate
    Surrogate("main_ICD", "main_NLP"),  # two variables in one surrogate; presumably treated as a combined term — confirm
)

In [7]:
# Run feature extraction on the prepared data using the surrogates defined
# above; the bare expression on the last line displays the result.
selected_features = run_feature_extraction(data, surrogates)
selected_features

['main_NLP',
 'main_ICD',
 'NLP6',
 'NLP56',
 'NLP160',
 'NLP161',
 'NLP306',
 'NLP309',
 'NLP403',
 'NLP536',
 'NLP564']

In [8]:
# Train the phenotyping model on the selected features.
# The call returns three values; only `coefficients` is displayed here.
# NOTE(review): train_roc and split_roc are not used in the cells shown —
# confirm whether they are meant to feed the ROC plotting step.
coefficients, train_roc, split_roc = train_phenotyping_model(data, surrogates, selected_features)
coefficients

[('intercept', 0.782504569449425),
 ('main_ICD', 0.8980490622340085),
 ('main_NLP', 0.9679868483065888),
 ('main_ICD__main_NLP', 0.6305458364486853),
 ('healthcare_utilization', 0.7854994741783271),
 ('NLP536', 0.7771180276158706),
 ('NLP306', 0.7679186586220257),
 ('NLP403', 0.8710968564652215),
 ('NLP6', 0.9301900813708939),
 ('NLP160', 0.8708363591486759),
 ('NLP309', 0.6705519357242544),
 ('NLP564', 0.7979710032249674),
 ('NLP56', 0.7354525619722935)]

In [None]:
# TODO: not yet runnable. `predict_phenotype` is not among the names imported
# in the import cell of this notebook, so this cell raises NameError on a
# fresh "Run All" until the function is implemented and imported.
# (The cell has no execution count, i.e. it has not been run.)
phenotype = predict_phenotype(data, coefficients)
phenotype

In [None]:
# TODO: not yet runnable. `validate_phenotyping_model` is not among the names
# imported in the import cell of this notebook, so this cell raises NameError
# on a fresh "Run All" until the function is implemented and imported.
# (The cell has no execution count, i.e. it has not been run.)
validation = validate_phenotyping_model(data, coefficients)
validation

In [None]:
# TODO: not yet runnable. `plot_roc_curves` is not among the names imported in
# the import cell of this notebook, and `validation` is only assigned by the
# preceding (also unrun) TODO cell, so this cell raises NameError until both
# are available.
plot_roc_curves(validation)