In [7]:
%run ../00_default_options.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
from robotehr.api.training import get_training_results
from robotehr.pipelines import training
from robotehr.models.cohort import Cohort, OnsetDataFrame
from robotehr.models.data import FeaturePipeline
from robotehr.pipelines.supporters.preprocessing import DataLoader

In [9]:
# Load the cohort registered under id 1. The id is hard-coded; the lookup
# itself is implemented by robotehr's `Cohort.load` (not visible here).
cohort = Cohort.load(id=1)

In [11]:
# Load the onset data frame registered under id 15 (hard-coded id).
# NOTE(review): presumably this belongs to the cohort loaded above — confirm.
onset_dataframe = OnsetDataFrame.load(id=15)

In [12]:
# Load the feature pipeline registered under id 21 (hard-coded id).
feature_pipeline = FeaturePipeline.load(id=21)

In [13]:
import morpher.config
from morpher.jobs import *

In [14]:
# Feature-name fragments to keep: "any" occurrence flags plus min/max
# aggregates. They are combined below into a single alternation pattern.
regexes = [
    'any__Diagnosis__',
    'any__Procedure__',
    'any__Drug__',
    'any__Material__',
    'any__Encounter__',
    'any__AlcoholUse__',
    'any__DrugUse__',
    'any__TobaccoUse__',
    'min__height__',
    'min__weight__',
    'min__vitalsign__',
    'min__measurement__',
    'min__labvalue__',
    'max__height__',
    'max__weight__',
    'max__vitalsign__',
    'max__measurement__',
    'max__labvalue__'
]

# One "a|b|c" pattern; str.join avoids the trailing-separator bookkeeping
# and does not leave a stray loop variable in the notebook namespace.
column_selector = "|".join(regexes)

In [15]:
from enum import Enum


class RaceType(str, Enum):
    """Coarse race categories that the raw ``race`` labels are collapsed into."""
    AFRICAN = 'African'
    AMERICAN_BLACK = 'Black or African-American'
    AMERICAN_NATIVE = 'Native American'
    ASIAN = 'Asian'
    ASIAN_PACIFIC = 'Asian Pacific'
    ASIAN_INDIAN = 'Asian Indian'
    ASIAN_CHINESE = 'Asian Chinese'
    HISPANIC = 'Hispanic or Latino'
    OTHER = 'Other'
    WHITE = 'White'


# Raw ``race`` labels grouped by the category they are mapped to.
# Labels missing from this table become NaN after the mapping step.
RACE_MAPPING = {
    RaceType.AFRICAN: [
        'Cape Verdian',
        'Congolese',
        'Eritrean',
        'Ethiopian',
        'Gabonian',
        'Ghanaian',
        'Guinean',
        'Ivory Coastian',
        'Kenyan',
        'Liberian',
        'Madagascar',
        'Malian',
        'Nigerian',
        'Other: East African',
        'Other: North African',
        'Other: South African',
        'Other: West African',
        'Senegalese',
        'Sierra Leonean',
        'Somalian',
        'Sudanese',
        'Tanzanian',
        'Togolese',
        'Ugandan',
        'Zimbabwean'
    ],
    RaceType.AMERICAN_BLACK: [
        'African American (Black)',
        'African-American',
        'Black Or African-American',
        'Black or African - American',
    ],
    RaceType.AMERICAN_NATIVE: [
        'American (Indian/Alaskan)',
        'Native American'
    ],
    RaceType.ASIAN: [
        'Asian',
        'Bangladeshi',
        'Bhutanese',
        'Burmese',
        'Cambodian',
        'Hmong',
        'Indonesian',
        'Japanese',
        'Korean',
        'Laotian',
        'Malaysian',
        'Maldivian',
        'Nepalese',
        'Okinawan',
        'Pakistani',
        'Singaporean',
        'Taiwanese',
        'Thai',
        'Vietnamese',
        'Yapese'
    ],
    RaceType.ASIAN_PACIFIC: [
        'Asian (Pacific Islander)',
        'Carolinian',
        'Chamorro',
        'Chuukese',
        'Fijian',
        'Filipino',
        'Guamanian',
        'Guamanian Or Chamorro',
        'Guamanian or Chamorro',
        'Iwo Jiman',
        'Kiribati',
        'Kosraean',
        'Mariana Islander',
        'Marshallese',
        'Melanesian',
        'Micronesian',
        'Native Hawaiian',
        'New Hebrides',
        'Other Pacific Islander',
        'Pacific Islander',
        'Palauan',
        'Pohnpeian',
        'Polynesian',
        'Saipanese',
        'Samoan',
        'Papua New Guinean',
        'Tahitian',
        'Tokelauan',
        'Tongan'
    ],
    RaceType.ASIAN_INDIAN: [
        'Asian Indian',
        'Sri Lankan',
        'Sri lankan',
        'West Indian'
    ],
    RaceType.ASIAN_CHINESE: [
        'Chinese',
    ],
    RaceType.HISPANIC: [
        'Barbadian',
        'Dominica Islander',
        'Grenadian',
        'Haitian',
        'Hispanic/Latino',
        'Jamaican',
        'St Vincentian',
        'Trinidadian'
    ],
    RaceType.OTHER: [
        '',
        'Aa',
        'Ab',
        'Af',
        'Ag',
        'Ak',
        'Al',
        'Ap',
        'Ar',
        'Av',
        'Ay',
        'B',
        'B1',
        'B2',
        'B3',
        'B4',
        'B5',
        'B6',
        'B7',
        'B8',
        'B9',
        'Ba',
        'Bb',
        'Bc',
        'Bd',
        'Be',
        'Bf',
        'Bg',
        'Bh',
        'Bj',
        'Bk',
        'Bm',
        'Bn',
        'Bo',
        'Bp',
        'Bq',
        'Br',
        'Bs',
        'Bt',
        'Bu',
        'Bv',
        'Bw',
        'Bx',
        'By',
        'Bz',
        'I',
        'MSDW_NOT APPLICABLE',
        'MSDW_OTHER',
        'MSDW_UNKNOWN',
        'NOT AVAILABLE',
        'Non Hispanic',
        'O',
        'Other',
        'Pk',
        'Pl',
        'Pm',
        'Po',
        'Ps',
        'Pv',
        'U',
        'Unk',
        'Unknown',
        'W'
    ],
    RaceType.WHITE: [
        'Caucasian (White)',
        'White'
    ]
}

# Flat lookup (raw label -> RaceType), built once instead of on every call.
RACE_BY_LABEL = {
    label: category
    for category, labels in RACE_MAPPING.items()
    for label in labels
}

# Demographic / identifier columns removed before encoding.
UNUSED_COLUMNS = [
    'medical_record_number',
    'mother_account_number',
    'date_of_birth',
    'month_of_birth',
    'patient_ethnic_group',
    'religion',
    'address_zip',
    'deceased_indicator',
    'marital_status_code',
]

# Columns that are replaced by their one-hot expansion.
ONE_HOT_COLUMNS = ['gender', 'race']


class CustomDataLoader(DataLoader):
    """DataLoader with project-specific cleaning, encoding, scaling and imputation.

    ``transform`` prepares the full feature frame; ``transform_training_data``
    fits a scaler and an imputer and stores them in ``self.objects``;
    ``transform_test_data`` re-applies those fitted objects.
    NOTE(review): when and in which order the base ``DataLoader`` invokes these
    hooks is defined in robotehr and not visible here.
    """

    def transform(self, X, y):
        """Clean and encode the feature frame.

        Steps: drop ``UNUSED_COLUMNS``, collapse raw ``race`` labels via
        ``RACE_BY_LABEL``, one-hot encode ``gender`` and ``race``, and fill
        missing values with ``False`` in every column whose name contains
        ``'any'``.

        The caller's ``X`` is not modified; a new frame is returned.

        Args:
            X: pandas DataFrame containing at least ``UNUSED_COLUMNS``,
                ``gender`` and ``race``.
            y: target data; returned unchanged.

        Returns:
            Tuple ``(X, y)`` with the transformed frame.

        Raises:
            KeyError: if one of ``UNUSED_COLUMNS`` is missing from ``X``.
        """
        from sklearn.preprocessing import OneHotEncoder

        # drop() returns a new frame, so the caller's frame stays intact and
        # the method can safely be called again on the same input.
        X = X.drop(columns=UNUSED_COLUMNS)

        # value mapping
        X['race'] = (
            X['race']
            .map(RACE_BY_LABEL)
            .astype(pd.api.types.CategoricalDtype(RaceType))
        )

        # one-hot encoding
        # NOTE(review): no explicit NaN handling happens before encoding;
        # unmapped race labels are NaN at this point — confirm the encoder
        # version in use accepts that.
        for column in ONE_HOT_COLUMNS:
            enc = OneHotEncoder(sparse=False)
            encoded = enc.fit_transform(X[[column]])
            encoded_frame = pd.DataFrame(
                data=encoded,
                columns=[f'{column}_{c}' for c in enc.categories_[0]],
                # join() aligns on index labels; without this the encoded
                # rows would carry a fresh 0..n-1 index and could be attached
                # to the wrong rows (or to none) of X.
                index=X.index,
            )
            X = X.join(encoded_frame)
            del X[column]

        any_columns = X.columns[X.columns.str.contains('any')]
        X[any_columns] = X[any_columns].fillna(False)
        return X, y

    def transform_training_data(self, X_train, y_train):
        """Fit scaler and imputer on the training split and apply them.

        The fitted objects are stored under ``self.objects['scaler']`` and
        ``self.objects['imputer']`` for later use on test data. Scaling runs
        before imputation, so missing values are filled in the scaled space.

        Returns:
            Tuple ``(X_train, y_train)`` where ``X_train`` is a DataFrame with
            the original columns and index; ``y_train`` is unchanged.
        """
        from sklearn.impute import SimpleImputer
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        self.objects['scaler'] = scaler
        X_train_scaled = scaler.fit_transform(X_train)

        imputer = SimpleImputer()
        self.objects['imputer'] = imputer
        X_train_imputed = imputer.fit_transform(X_train_scaled)

        X_train = pd.DataFrame(
            data=X_train_imputed,
            columns=X_train.columns,
            index=X_train.index
        )
        return X_train, y_train

    def transform_test_data(self, X_test, y_test):
        """Apply the scaler and imputer fitted on the training split.

        Requires ``transform_training_data`` to have populated
        ``self.objects`` first.

        Returns:
            Tuple ``(X_test, y_test)`` where ``X_test`` is a DataFrame with
            the original columns and index; ``y_test`` is unchanged.

        Raises:
            KeyError: if no fitted scaler/imputer is stored in ``self.objects``.
        """
        scaler = self.objects['scaler']
        imputer = self.objects['imputer']
        X_test_scaled = scaler.transform(X_test)
        # transform only: re-fitting here would replace the statistics learned
        # on the training split with ones computed from the test split.
        X_test_imputed = imputer.transform(X_test_scaled)

        X_test = pd.DataFrame(
            data=X_test_imputed,
            columns=X_test.columns,
            index=X_test.index
        )
        return X_test, y_test


data_loader = CustomDataLoader(column_selector=column_selector)

In [16]:
from itertools import product

In [17]:
def _threshold_grid():
    """Return a fresh list of the fractions 0.05, 0.10, ..., 0.95."""
    return [percent / 100 for percent in range(5, 100, 5)]


def _observation_window_grid():
    """Return fresh [start, -1] pairs for start = -361, -331, ..., -31."""
    return [[start, -1] for start in range(-361, -1, 30)]


# Numeric and occurring features use the same grids, but each variable
# gets its own independent list objects.
thresholds_numeric = _threshold_grid()
observation_windows_numeric = _observation_window_grid()
thresholds_occurring = _threshold_grid()
observation_windows_occurring = _observation_window_grid()

In [18]:
# Full cartesian product of the four grids; it is consumed by the
# comprehension below.
iterator = product(
    thresholds_numeric, observation_windows_numeric,
    thresholds_occurring, observation_windows_occurring,
)

# Keep only combinations in which numeric and occurring features share the
# same threshold and the same observation window.
configs = [
    (threshold_numeric, window_numeric, threshold_occurring, window_occurring)
    for threshold_numeric, window_numeric, threshold_occurring, window_occurring in iterator
    if threshold_numeric == threshold_occurring and window_numeric == window_occurring
]

targets = [
    'cytomegaloviral_disease_onset_from_0_days_after_to_365_days_after',
]

algorithms = [
    morpher.config.algorithms.GBDT,
    morpher.config.algorithms.RF,
    morpher.config.algorithms.DT,
    morpher.config.algorithms.LR,
]

samplers = [
    morpher.config.samplers.RANDOM,
    morpher.config.samplers.URANDOM,
    morpher.config.samplers.BORDERLINE,
    morpher.config.samplers.SMOTE,
    morpher.config.samplers.NOSAMPLER,
]

In [None]:
# All arguments for the training run, collected in one place and passed on
# unchanged (same keywords, same order) to training.execute.
training_kwargs = {
    # run labels
    'comment': 'NEW baseline-4 cmv with RFE',
    'version': '100.4.1',
    # objects loaded / built in the cells above
    'cohort': cohort,
    'onset_dataframe': onset_dataframe,
    'feature_pipeline': feature_pipeline,
    'data_loader': data_loader,
    # search space
    'observation_iterator': configs,
    'targets': targets,
    'algorithms': algorithms,
    'samplers': samplers,
    # feature construction
    'feature_type_occurring': "occurring",
    'feature_type_numeric': "numeric_binned",
    'bin_size': 30,
    # NOTE(review): "rfe" presumably means recursive feature elimination
    # (the run comment says "with RFE") — confirm in robotehr.
    'rfe__run': True,
    'rfe__step_size': 50,
}

pipeline = training.execute(**training_kwargs)

Columns (11,18,19,20,24,35,64,83,90,92,94,96,97,106,114,117,120,125,126,133,136,140,141,142,143,144,150,178,235,244,248,288,311,317,320,325,326,330,334,341,353,359,360,361,365,402,421,425,430,434,436,439,446) have mixed types. Specify dtype option on import or set low_memory=False.


Fetching data for <fiber.condition.mrns.MRNs object at 0x7f6790e57668>
Fetching data for Patient (...)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 753.315ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 487.926ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 203.727ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 227.742ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 589.37ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 245.277ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 4.916ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 554.042ms

*** Training of model 'GradientBoostingClassifier' 

F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.
invalid value encountered in long_scalars
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 608.184ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 660.358ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 181.053ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 272.804ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 669.537ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 289.95ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 443.271ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 635.552ms

*** Training of model 'GradientBoostingClassifier

F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.
invalid value encountered in long_scalars
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 294.952ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 865.776ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 715.431ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 264.519ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 405.856ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 976.478ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 471.452ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 619.467ms

*** Training of model 'GradientBoostingClassifie

F-score is ill-defined and being set to 0.0 due to no predicted samples.
Precision is ill-defined and being set to 0.0 due to no predicted samples.
invalid value encountered in long_scalars
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 235.139ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 501.373ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 447.851ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 312.86ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 551.61ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 266.012ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 439.772ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 690.504ms

*** Training of model 'GradientBoostingClassifier'

Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 451.503ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 834.237ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 60.408ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 102.498ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 869.196ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 238.643ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 870.634ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 684.677ms

*** Training of model 'GradientBoostingClassifier

Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 919.848ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 814.115ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 856.616ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 380.002ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 253.924ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 125.062ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 973.269ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 474.098ms

*** Training of model 'GradientBoostingClassifie

Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 134.814ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 863.587ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 395.549ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 605.528ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 859.028ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 394.521ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 694.67ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 499.64ms

*** Training of model 'GradientBoostingClassifier'

Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 279.962ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 380.946ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 79.536ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 363.41ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 181.102ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 36.624ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 972.348ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 878.603ms

*** Training of model 'GradientBoostingClassifier' 

Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 484.767ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 389.949ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 429.544ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 486.676ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 947.352ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 945.477ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 698.634ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 764.264ms

*** Training of model 'GradientBoostingClassifie

Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 398.671ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 722.124ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 639.519ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 70.116ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 908.667ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 921.221ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 522.458ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 379.498ms

*** Training of model 'GradientBoostingClassifier

Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.
Data with input dtype bool, float64 were all converted to float64 by StandardScaler.


*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 375.482ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 247.171ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 107.174ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 666.672ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 141.169ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 231.595ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 270.321ms

*** Training of model 'GradientBoostingClassifier' started.
*** Training of classifier ready. Time elapsed: 802.914ms

*** Training of model 'GradientBoostingClassifie

In [None]:
# Show the id of the object returned by training.execute.
pipeline.id