Python Port of PheCAP Algorithm

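This notebook replicates the process in `main.R` using the `pyphecap` package: feature extraction, model training, phenotype prediction, and validation.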

In [1]:
import sys

sys.path.append('../src')

In [2]:
import pandas as pd


from pyphecap.predict import predict_phenotype, validate_phenotyping_model
from pyphecap.training import train_phenotyping_model
from pyphecap.feature_extraction import run_feature_extraction
from pyphecap.phecap_data import add_validation_column, Data
from pyphecap.surrogate import Surrogates, Surrogate

In [3]:
# Load the sample data that was output from R, then show its (rows, columns) shape.
ehr_data = pd.read_csv(filepath_or_buffer='../data/ehr_data.csv')
ehr_data.shape

(10000, 590)

In [4]:
# Gold standard: tally the label values, counting only the rows that have a label.
ehr_data['label'].dropna().value_counts()

1.0    119
0.0     62
Name: label, dtype: int64

In [5]:
# The return value of this call is not used, so it presumably adds the validation
# column to ehr_data in place — TODO confirm in pyphecap.phecap_data.
add_validation_column(ehr_data)

# Wrap the frame for the pyphecap steps below; the three strings are presumably the
# names of the utilization, label and validation columns — verify against Data.
data = Data(ehr_data, 'healthcare_utilization', 'label', 'validation')

In [6]:
# One tuple per surrogate: two built from a single name each, and one built from
# both names together. The names are presumably columns of ehr_data — TODO confirm.
surrogate_columns = [
    ("main_ICD",),
    ("main_NLP",),
    ("main_ICD", "main_NLP"),
]
surrogates = Surrogates(*(Surrogate(*columns) for columns in surrogate_columns))

In [7]:
# Run pyphecap's feature extraction on the wrapped data and the surrogates.
# The result, displayed below, is a list of selected feature names.
selected_features = run_feature_extraction(data, surrogates)
selected_features

['main_NLP',
 'main_ICD',
 'NLP6',
 'NLP56',
 'NLP160',
 'NLP161',
 'NLP306',
 'NLP309',
 'NLP403',
 'NLP536',
 'NLP564']

In [8]:
# Train the phenotyping model using the selected features.
# `coefficients` (displayed below) is a list of (term, value) pairs, starting with
# the intercept. `train_roc` and `split_roc` are not inspected in this notebook;
# judging by their names they hold ROC results — TODO confirm what they contain.
coefficients, train_roc, split_roc = train_phenotyping_model(data, surrogates, selected_features)
coefficients

[('intercept', 0.6268718575234931),
 ('main_ICD', 0.6337449679412155),
 ('main_NLP', 1.003751942804733),
 ('main_ICD__main_NLP', 0.4834152159007125),
 ('healthcare_utilization', 0.5874735267968327),
 ('NLP306', 0.6868921584917298),
 ('NLP160', 0.6024910109329423),
 ('NLP564', 0.6432881298421733),
 ('NLP161', 0.7735609098166827),
 ('NLP6', 0.6587425381742472),
 ('NLP309', 0.5728663342305981),
 ('NLP403', 0.7584180805634746),
 ('NLP536', 0.6954064677114601)]

In [9]:
# Apply the fitted coefficients to the data. The result is an array of numeric
# scores (displayed truncated below), presumably one per row — TODO confirm.
phenotype = predict_phenotype(data, surrogates, coefficients, selected_features)
phenotype

array([ 2.26650268, 16.63737349,  6.0610467 , ...,  1.51065185,
        1.44694986,  3.6321152 ])

In [10]:
# Evaluate the fitted model with pyphecap's validation routine.
# NOTE(review): the value displayed for `roc` below is a single scalar (~0.80),
# which looks like an AUC rather than a ROC curve. The return order of
# validate_phenotyping_model may be (auc, roc), or the names here may be
# swapped — confirm against pyphecap.predict before relying on either variable.
roc, auc = validate_phenotyping_model(data, surrogates, coefficients, selected_features)
roc

0.8031674208144797

In [None]:
# TODO
# NOTE(review): `plot_roc_curves` is not imported or defined in any cell shown in
# this notebook, and this cell has never been executed. As written it will raise
# NameError on Restart & Run All. Import or implement the helper (and confirm it
# accepts the `roc` and `auc` values produced above) before running this cell.
plot_roc_curves(roc, auc)