# description

sklearn modeling of the median imputed training data. note the preprocessing of data from 07.20-worst_case_model was performed in R (09.newagg2_preprocessing_med_impute.rmd). this eventually will be converted over to python, but for now works in r. 

preprocessing includes variable formatting (categorical to factor variables in r, train/test split, and median imputation).

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import numpy as np
import glob
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.externals.joblib import Memory
from sklearn.metrics import classification_report
memory = Memory(cachedir='/tmp', verbose=0)
#@memory.cache above any def fxn.

%matplotlib inline
plt.style.use('ggplot')

from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {
        'width': 1024,
        'height': 768,
        'scroll': True,
})

#reducing warnings that are super common in my model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)


%load_ext autotime

You provided "cachedir='/tmp'", use "location='/tmp'" instead.
  del sys.path[0]


In [2]:
import sys
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, Imputer
from sklearn.model_selection import StratifiedKFold
#from Compute_gower_distance import select_train_samples
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, accuracy_score, auc, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier #conda install -c conda-forge xgboost to install

##adding these, lets see if it helps with xgboost crash
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier

RANDOM_STATE = 15485867

time: 67.2 ms


In [3]:
os.chdir('/Users/geickelb1/Documents/GitHub/mimiciii-antibiotics-modeling') #use to change working directory
wd= os.getcwd() #'/Users/geickelb1/Documents/GitHub/mimiciii-antibiotics-modeling'

date="04042019"
final_pt_df2 = pd.read_csv(Path(wd + '/data/raw/csv/04042019_final_pt_df2_v.csv') , index_col=0) #only for patients with minimum vitals
patients= list(final_pt_df2['subject_id'].unique())
hadm_id= list(final_pt_df2['hadm_id'].unique())
icustay_id= list(final_pt_df2['icustay_id'].unique())
icustay_id= [int(x) for x in icustay_id]

time: 99.4 ms


In [56]:
train_data= pd.read_csv("/Users/geickelb1/Documents/GitHub/mimiciii-antibiotics-modeling/models/imputation/04042019_newagg2_median_imputed_train.csv") #two class training data

time: 70.2 ms


# light data reformatting for model

### most data are already converted to median type zscores, however weight and admit age still need to be converted.

In [57]:
weight_median=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","weight"]+1).median()
weight_quant1=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","weight"]+1).quantile(0.25)#.between(train_data['col'].quantile(.25), df['col'].quantile(.75), inclusive=True)]
weight_quant3=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","weight"]+1).quantile(0.75)
weight_iqr=weight_quant3-weight_quant1; weight_iqr
print(weight_median,weight_quant3,weight_quant1, weight_iqr)

4.356708826689592 4.499809670330265 4.200204952921578 0.29960471740868666
time: 9.86 ms


In [58]:
age_median=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","first_admit_age"]+1).median()
age_quant1=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","first_admit_age"]+1).quantile(0.25)
age_quant3=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","first_admit_age"]+1).quantile(0.75)
age_iqr=age_quant3-age_quant1;
print(age_median,age_quant3,age_quant1, age_iqr)

4.194943760778217 4.367991089683742 3.9691119690666907 0.39887912061705144
time: 9.52 ms


In [59]:
#converting to log scaled standardized data for age/weight
train_data['weight']=train_data['weight'].apply(lambda x: (np.log(x+1)-weight_median)/weight_iqr)
train_data['first_admit_age']=train_data['first_admit_age'].apply(lambda x: (np.log(x+1)-age_median)/age_iqr)

time: 22.9 ms


### onehot encoding categorical var

In [60]:
cols_to_transform=['any_vasoactive', 'leukocyte', 'pao2fio2Ratio', 'vent_recieved']
train_data = pd.get_dummies(train_data, columns = cols_to_transform )
train_data.head()

Unnamed: 0,icustay_id,amax_bun,amax_creatinine,amax_daily_sofa,amax_heartrate,amax_meanartpress,amax_platelet,amax_ptt,amax_sysbp,amax_temperature,...,any_vasoactive_False,any_vasoactive_True,leukocyte_False,leukocyte_True,"pao2fio2Ratio_(0, 200]","pao2fio2Ratio_(200, 333]","pao2fio2Ratio_(333, 475]","pao2fio2Ratio_(475, 3000]",vent_recieved_False,vent_recieved_True
0,200012,0.069095,0.076014,-0.5,0.077448,0.047571,-0.076639,0.01264,-0.022901,0.021964,...,1,0,1,0,0,0,0,1,1,0
1,200014,0.056406,-0.16415,-0.207519,0.021221,0.263979,-0.067398,-0.030164,0.118889,0.003685,...,1,0,1,0,0,1,0,0,0,1
2,200033,-0.068362,-0.253202,-0.5,0.122666,0.125991,-0.061462,-0.034854,0.084386,0.061749,...,0,1,1,0,0,0,0,1,0,1
3,200036,0.136269,0.0,-0.5,0.132424,0.153843,-0.034114,0.105303,0.078839,0.015897,...,1,0,1,0,0,0,0,1,1,0
4,200059,0.287056,0.347655,0.403677,0.127583,0.196365,0.085552,0.52184,0.139305,0.018327,...,0,1,1,0,0,1,0,0,0,1


time: 28 ms


In [9]:
list(train_data)

['icustay_id',
 'amax_bun',
 'amax_creatinine',
 'amax_daily_sofa',
 'amax_heartrate',
 'amax_meanartpress',
 'amax_platelet',
 'amax_ptt',
 'amax_sysbp',
 'amax_temperature',
 'amin_bun',
 'amin_creatinine',
 'amin_daily_sofa',
 'amin_heartrate',
 'amin_meanartpress',
 'amin_platelet',
 'amin_ptt',
 'amin_sysbp',
 'amin_temperature',
 'median_bun',
 'median_creatinine',
 'median_daily_sofa',
 'median_heartrate',
 'median_meanartpress',
 'median_platelet',
 'median_ptt',
 'median_sysbp',
 'median_temperature',
 'std_bun',
 'std_creatinine',
 'std_daily_sofa',
 'std_heartrate',
 'std_meanartpress',
 'std_platelet',
 'std_ptt',
 'std_sysbp',
 'std_temperature',
 'first_admit_age',
 'weight',
 'final_bin',
 'any_vasoactive_False',
 'any_vasoactive_True',
 'leukocyte_False',
 'leukocyte_True',
 'pao2fio2Ratio_(0, 200]',
 'pao2fio2Ratio_(200, 333]',
 'pao2fio2Ratio_(333, 475]',
 'pao2fio2Ratio_(475, 3000]',
 'vent_recieved_False',
 'vent_recieved_True']

time: 2.61 ms


# binarizing outcome for training data

In [61]:
#binarizing and poping outcome for training data
train_data.loc[train_data['final_bin']=="C_pos/A_full","final_bin"]=1
train_data.loc[train_data['final_bin']=="C_neg/A_partial","final_bin"]=0
train_data['final_bin']=pd.to_numeric(train_data['final_bin'])


time: 10.7 ms


# establishing training data and labels

In [11]:
x_train= train_data.copy()
icustay_id=x_train.pop('icustay_id')
y_train= x_train.pop("final_bin").values

time: 5.33 ms


# building a sklearn pipeline
As the name suggests, pipeline class allows sticking multiple processes into a single scikit-learn estimator. pipeline class has fit, predict and score method just like any other estimator (ex. LinearRegression).

To implement pipeline, as usual we separate features and labels from the data-set at first.

if we needed our data to be scaled we would apply that here, but i've already done that.

In [11]:
# if we needed our data to be scaled we would apply that here, but i've already done that.
# from sklearn.preprocessing import StandardScaler

time: 704 µs


Now we are ready to create a pipeline object by providing with the list of steps. 

Here our steps are standard scalar and support vector machine. 

These steps are list of tuples consisting of name and an instance of the transformer or estimator.

In [12]:
# # steps = [('scaler', StandardScaler()), ('SVM', SVC())] #so step 1 is known as scaler, which performs StandardScaler() function on the input. 
# from sklearn.svm import SVC
# steps = [('SVM', SVC())] #removed step 1 since i already scaled my data
# from sklearn.pipeline import Pipeline
# pipeline = Pipeline(steps) # define the pipeline object.

time: 66.8 ms


In [15]:
# steps = [('scaler', StandardScaler()), ('SVM', SVC())] #so step 1 is known as scaler, which performs StandardScaler() function on the input. 
from sklearn.svm import SVC
steps = [('SVM', SVC(gamma="scale"))] #removed step 1 since i already scaled my data. added gamma=scale
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps) # define the pipeline object.

time: 3.46 ms


The strings (‘scaler’, ‘SVM’) can be anything, as these are just names to identify clearly the transform or estimator. We can use make_pipeline instead of Pipeline to avoid naming the estimator or transformer. The final step has to be an estimator in this list of tuples.

if we needed to do train/test split (which i've already done), we could use:

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=30, stratify=Y) #It’s necessary to use stratify as I’ve mentioned before that the labels are imbalanced as most of the wine quality falls in the range 5,6.

#### hypertuning:
SVM is usually optimized using two parameters gamma,C . I will discuss in an upcoming post on how they exactly work, but here let’s define a parameter grid that we will use in GridSearchCV .

In [22]:
parameteres = {'SVM__kernel':('linear', 'rbf'), 'SVM__C':[0.1, 1, 10]} #i think i need to include the SVM__  because i'm passing a pipeline object in.

time: 808 µs


Now we instantiate the GridSearchCV object with pipeline and the parameter space with 5 folds cross validation.


In [23]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5) #pipeline here is basically just adding the model. 

time: 1.3 ms


We can use this to fit on the training data-set and test the algorithm on the training set. Also we can find the best fit parameters for the SVM as below. 
## NOTE: i need to figure out how to extract cv misclass/ other loss parameter.

In [33]:
grid.fit(x_train, y_train)
#print("score = %3.2f") %(grid.score(x_test,y_test))\
print(grid.score(x_train,y_ty_trainest))
print(grid.best_params_)

NameError: name 'rid' is not defined

time: 33.5 s


In [31]:
# print("score = %s ") %(grid.score(x_train,y_train))
# print(grid.best_params_)

score = %s 


TypeError: unsupported operand type(s) for %: 'NoneType' and 'float'

time: 503 ms


In [37]:
print(grid.score(x_train,y_train))
print(grid.best_params_)

0.7816642120765832
{'SVM__C': 10, 'SVM__kernel': 'linear'}
time: 475 ms


In [39]:
grid.cv_results_



{'mean_fit_time': array([0.73107057, 0.79110894, 1.71803112, 0.84989548]),
 'mean_score_time': array([0.07952285, 0.13074217, 0.07977705, 0.11922569]),
 'mean_test_score': array([0.7757732 , 0.76859352, 0.78019146, 0.77632548]),
 'mean_train_score': array([0.77733802, 0.77264354, 0.78221657, 0.79768044]),
 'param_SVM__C': masked_array(data=[1, 1, 10, 10],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_SVM__kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'SVM__C': 1, 'SVM__kernel': 'linear'},
  {'SVM__C': 1, 'SVM__kernel': 'rbf'},
  {'SVM__C': 10, 'SVM__kernel': 'linear'},
  {'SVM__C': 10, 'SVM__kernel': 'rbf'}],
 'rank_test_score': array([3, 4, 1, 2], dtype=int32),
 'split0_test_score': array([0.76724931, 0.76172953, 0.7700092 , 0.77092916]),
 'split0_train_score': array([0.78066743, 0.7735328 , 0.

time: 8.51 ms


In [125]:
(list(x_train)) #5420 x 48

48

time: 4.87 ms


# local methods (trying functions written by postdoc)


* 1)Receive a sample S for testing

* 2)Use gower similarity to find a cohort of K similar case samples and K similar control sample from all original samples, which constructs a cohort of 2K samples.

* 3)Build a predictive model based on the similar sample cohort and predict label for sample S individually.


In [None]:
#Compute_Gower_Distance.py:

In [12]:
import os, sys
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.utils import validation
from sklearn.metrics import pairwise
from scipy.sparse import issparse

time: 3.13 ms


converting to floats

In [13]:
def _return_float_dtype(X, Y):
    ##used in grower distance, converts values to floats for formatting.
    """
    1. If dtype of X and Y is float32, then dtype float32 is returned.
    2. Else dtype float is returned.
    """
    if not issparse(X) and not isinstance(X, np.ndarray):
        X = np.asarray(X)

    if Y is None:
        Y_dtype = X.dtype
    elif not issparse(Y) and not isinstance(Y, np.ndarray):
        Y = np.asarray(Y)
        Y_dtype = Y.dtype
    else:
        Y_dtype = Y.dtype

    if X.dtype == Y_dtype == np.float32:
        dtype = np.float32
    elif X.dtype == np.object and not issparse(X):
        dtype = np.float
        for col in range(X.shape[1]):
            if not np.issubdtype(type(X[0, col]), np.number):
                dtype = np.object
                break
    else:
        dtype = np.float
    return X, Y, dtype


time: 24.2 ms


In [14]:
# x_train_float, y_train_float, dtype =_return_float_dtype(X=x_train, Y=y_train)

time: 518 µs


In [15]:
def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
    ##used in grower distance, checks x and y dimensions against each otehr.
    X, Y, dtype_float = _return_float_dtype(X, Y)

    warn_on_dtype = dtype is not None
    estimator = 'check_pairwise_arrays'
    if dtype is None:
        dtype = dtype_float
    
    ##Input validation on an array, list, sparse matrix or similar.
    ##By default, the input is checked to be a non-empty 2D array containing only finite values.
    
    if Y is X or Y is None:
        X = Y = validation.check_array(X, accept_sparse='csr', dtype=dtype,
                            warn_on_dtype=warn_on_dtype, estimator=estimator)
    else:
        X = validation.check_array(X, accept_sparse='csr', dtype=dtype,
                        warn_on_dtype=warn_on_dtype, estimator=estimator)
        Y = validation.check_array(Y, accept_sparse='csr', dtype=dtype,
                        warn_on_dtype=warn_on_dtype, estimator=estimator)

    if precomputed:
        if X.shape[1] != Y.shape[0]:
            raise ValueError("Precomputed metric requires shape "
                             "(n_queries, n_indexed). Got (%d, %d) "
                             "for %d indexed." %
                             (X.shape[0], X.shape[1], Y.shape[0]))
    elif X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices: "
                         "X.shape[1] == %d while Y.shape[1] == %d" % (
                             X.shape[1], Y.shape[1]))

    return X, Y

time: 27.1 ms


In [16]:
#check_pairwise_arrays(X=x_train_float, Y=y_train_float, precomputed=True)

time: 431 µs


In [17]:
def gower_distances(X, Y=None, w=None, categorical_features=None):
    """
    Computes the gower distances between X and Y

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Y : array-like, shape (n_samples, n_features)

    w:  array-like, shape (n_features)
    According the Gower formula, w is an attribute weight.

    categorical_features: array-like, shape (n_features)
    Indicates with True/False wheter a column is a categorical attribute.
    This is useful when categorical atributes are represented as integer
    values.

    Returns
    -------
    similarities : ndarray, shape (n_samples, )

    Notes
    ------
    Gower is a similarity measure for categorical, boolean and numerical mixed
    data.

    """

    X, Y = check_pairwise_arrays(X, Y, dtype=(np.object, None)[issparse(X) or
                                                               issparse(Y)])
    rows, cols = X.shape

    if categorical_features is None:
        categorical_features = []
        for col in range(cols):
            if np.issubdtype(type(X[0, col]), np.number):
                categorical_features.append(False)
            else:
                categorical_features.append(True)
    # Calculates the normalized ranges and max values of numeric values
    ranges_of_numeric = [0.0] * cols
    max_of_numeric = [0.0] * cols
    for col in range(cols):
        if not categorical_features[col]:
            max = None
            min = None
            if issparse(X):
                col_array = X.getcol(col)
                max = col_array.max() + 0.0
                min = col_array.min() + 0.0
            else:
                col_array = X[:, col].astype(np.double)
                max = np.nanmax(col_array)
                min = np.nanmin(col_array)

            if np.isnan(max):
                max = 0.0
            if np.isnan(min):
                min = 0.0
            max_of_numeric[col] = max
            ranges_of_numeric[col] = (1 - min / max) if (max != 0) else 0.0

    if w is None:
        w = [1] * cols

    yrows, ycols = Y.shape

    dm = np.zeros((rows, yrows), dtype=np.double)

    for i in range(0, rows):
        j_start = i

        # for non square results
        if rows != yrows:
            j_start = 0

        for j in range(j_start, yrows):
            sum_sij = 0.0
            sum_wij = 0.0
            for col in range(cols):
                value_xi = X[i, col]
                value_xj = Y[j, col]

                if not categorical_features[col]:
                    if (max_of_numeric[col] != 0):
                        value_xi = value_xi / max_of_numeric[col]
                        value_xj = value_xj / max_of_numeric[col]
                    else:
                        value_xi = 0
                        value_xj = 0

                    if ranges_of_numeric[col] != 0:
                        sij = abs(value_xi - value_xj) / ranges_of_numeric[col]
                    else:
                        sij = 0
                    wij = (w[col], 0)[np.isnan(value_xi) or np.isnan(value_xj)]
                else:
                    sij = (1.0, 0.0)[value_xi == value_xj]
                    wij = (w[col], 0)[value_xi is None and value_xj is None]
                sum_sij += (wij * sij)
                sum_wij += wij

            if sum_wij != 0:
                dm[i, j] = (sum_sij / sum_wij)
                if j < rows and i < yrows:
                    dm[j, i] = dm[i, j]
    return dm


time: 240 ms


In [18]:
# ##testing grower distance
# x_train1=x_train.iloc[:100,1:20]
# x_train2=x_train.iloc[101:201,1:20]
# print(len(x_train1), #2715
# len(x_train2)) #2715

time: 1.26 ms


In [19]:
# gower_distances(X=x_train1, Y=x_train2, w=None, categorical_features=None) #works

time: 431 µs


In [20]:
def select_train_samples(sample_id, all_xy, m, time_interval):# m is number of similar cases or controls
    num_control = m   # the ratio of case and control is 1:2, 1:3,1:4
    
    ####not sure what this is doing.
    if time_interval == 24:
        top_con_variables = [False]*128
        mid_cat_variables = [True]*5
        age_variable = [False]
        next_cat_variables = [True]*10
        last_con_variables = [False]*2

        flag_cate_fea = top_con_variables + mid_cat_variables + age_variable + next_cat_variables + last_con_variables # 24,48, ...., Note that, the length of 24h  is different from other hours  in terms of columns
    else:
        top_con_variables = [False]*129  #there is another item in other hours
        mid_cat_variables = [True]*5
        age_variable = [False]
        next_cat_variables = [True]*10
        last_con_variables = [False]*2

        flag_cate_fea = top_con_variables + mid_cat_variables + age_variable + next_cat_variables + last_con_variables # 24,48, ...., Note that, the length of 24h  is different from other hours  in terms of columns
        
    ##all_xy = all_xy.fillna(np.nan) # fill empty with nan

    x_candidate_label = all_xy.loc[sample_id] # get the object sample
    x_candidate = x_candidate_label.drop('label')
    x_candidate_tem = x_candidate.as_matrix()
    testing_sample = x_candidate_tem.reshape(1, -1)  # covert into ....

    all_x_candidate_tem = all_xy.drop([sample_id], axis=0, inplace=False) # delete the object sample from whole set

# select similar cases
    all_cases = all_x_candidate_tem[all_x_candidate_tem.label == 1]
    all_cases_candidate = all_cases.drop(['label'], axis=1, inplace=False)
    gower_candidate_case = all_cases_candidate.values[:, :] # convert into ndarray

    Gower_Distance_1 = gower_distances(gower_candidate_case, testing_sample, categorical_features = flag_cate_fea) # Gower_Distance_1 is ndarray
    Gower_Distance_2 = list(Gower_Distance_1)
    Gower_Distance_3 = pd.Series(Gower_Distance_2, index = all_cases_candidate.index)
    Gower_Distance_4 = Gower_Distance_3.sort_values(ascending=False)

    Id_selected_cases = Gower_Distance_4.index[:m].tolist() # the id set of the top m similar samples

# select similar controls
    all_controls = all_x_candidate_tem[all_x_candidate_tem.label == 0]
    all_controls_candidate = all_controls.drop(['label'], axis=1, inplace=False)
    gower_candidate_control = all_controls_candidate.values[:, :] # convert into ndarray

    Gower_Distance_11 = gower_distances(gower_candidate_control, testing_sample,categorical_features = flag_cate_fea) # Gower_Distance_1 is ndarray
    Gower_Distance_22 = list(Gower_Distance_11)
    Gower_Distance_33 = pd.Series(Gower_Distance_22, index = all_controls_candidate.index)
    Gower_Distance_44 = Gower_Distance_33.sort_values(ascending=False)

    Id_selected_controls = Gower_Distance_44.index[:num_control].tolist() # the id set of the top m similar samples

    train_set_id = Id_selected_controls+Id_selected_cases

    train_set_id = np.array(train_set_id)
    return train_set_id

time: 110 ms


individualization_predictor.py:

In [21]:

#folder = '/Users/xuzhenxing/Documents/mimic_AKI_data/real_time_prediction/features/all/dropped/xy'
# folder = './xy'


def preprocessing(folder, time_interval, isnormalized=True):
    """Data preprocessing, Preprocessing  missing data with mean imputation; Normalize continous feature with MinMaxScaler;
    Normalize categorical feature with OneHotEncoder.

    Args:
        folder: dir path of source data;
        time_interval: interval of time, can be 24,48,72,96,120,144.
    Returns:
        x: features
        y: lables

    """

    all_xy = pd.read_csv(os.path.join(folder, 'all_{}hours_test_individualization_1thousand.csv'.format(time_interval)), index_col=0)
    # print (all_xy.shape)
    # print (all_xy.columns)

    medi = ['diuretics', 'nsaid', 'radio', 'angiotensin']
    pat = ['gender', 'age', 'ethnicity']
    # Total 9 comorbidity
    comm = ['congestive_heart_failure', 'peripheral_vascular', 'hypertension',
            'diabetes', 'liver_disease', 'mi', 'cad', 'cirrhosis', 'jaundice']

    # Total 8 chartevents
    chart = ['DiasBP_min', 'DiasBP_max', 'DiasBP_first', 'DiasBP_last', 'DiasBP_slope', 'DiasBP_avg',
             'Glucose_min', 'Glucose_max', 'Glucose_first', 'Glucose_last', 'Glucose_slope', 'Glucose_avg',
             'HeartRate_min', 'HeartRate_max', 'HeartRate_first', 'HeartRate_last', 'HeartRate_slope', 'HeartRate_avg',
             'MeanBP_min', 'MeanBP_max', 'MeanBP_first', 'MeanBP_last', 'MeanBP_slope', 'MeanBP_avg',
             'RespRate_min', 'RespRate_max', 'RespRate_first', 'RespRate_last', 'RespRate_slope', 'RespRate_avg',
             'SpO2_min', 'SpO2_max', 'SpO2_first', 'SpO2_last', 'SpO2_slope', 'SpO2_avg',
             'SysBP_min', 'SysBP_max', 'SysBP_first', 'SysBP_last', 'SysBP_slope', 'SysBP_avg',
             'Temp_min', 'Temp_max', 'Temp_first', 'Temp_last', 'Temp_slope', 'Temp_avg']

    # Total 12 labvents
    lab = ['BICARBONATE_first', 'BICARBONATE_last', 'BICARBONATE_min', 'BICARBONATE_max', 'BICARBONATE_avg',
           'BICARBONATE_slope', 'BICARBONATE_count',
           'BUN_first', 'BUN_last', 'BUN_min', 'BUN_max', 'BUN_avg', 'BUN_slope', 'BUN_count',
           'CHLORIDE_first', 'CHLORIDE_last', 'CHLORIDE_min', 'CHLORIDE_max', 'CHLORIDE_avg', 'CHLORIDE_slope',
           'CHLORIDE_count',
           'CREATININE_first', 'CREATININE_last', 'CREATININE_min', 'CREATININE_max', 'CREATININE_avg',
           'CREATININE_slope', 'CREATININE_count',
           'HEMOGLOBIN_first', 'HEMOGLOBIN_last', 'HEMOGLOBIN_min', 'HEMOGLOBIN_max', 'HEMOGLOBIN_avg',
           'HEMOGLOBIN_slope', 'HEMOGLOBIN_count',
           'INR_first', 'INR_last', 'INR_min', 'INR_max', 'INR_avg', 'INR_count',
           'PLATELET_first', 'PLATELET_last', 'PLATELET_min', 'PLATELET_max', 'PLATELET_avg', 'PLATELET_slope',
           'PLATELET_count',
           'POTASSIUM_first', 'POTASSIUM_last', 'POTASSIUM_min', 'POTASSIUM_max', 'POTASSIUM_avg', 'POTASSIUM_slope',
           'POTASSIUM_count',
           'PT_first', 'PT_last', 'PT_min', 'PT_max', 'PT_avg', 'PT_count',
           'PTT_first', 'PTT_last', 'PTT_min', 'PTT_max', 'PTT_avg', 'PTT_count',
           'WBC_first', 'WBC_last', 'WBC_min', 'WBC_max', 'WBC_avg', 'WBC_slope', 'WBC_count',
           'CALCIUM_first', 'CALCIUM_last', 'CALCIUM_min', 'CALCIUM_max', 'CALCIUM_avg', 'CALCIUM_count'
           ]

    if time_interval != 24:  # The 24h data lack of the feature 'CALCIUM_slope'
        lab.append('CALCIUM_slope')
    subset = medi + pat + comm + ['avg_urine'] + ['egfr_min'] + ['label'] # note that ['avg_urine'] + ['egfr_min'] is important, ignoring if they are empty.

    all_xy = all_xy.dropna(subset=subset)

    # print ('after dropping nan in the catergorical variables, the shape is {}'.format(all_xy.shape))

    all_conti_x = all_xy[chart + lab + ['avg_urine'] + ['egfr_min'] + ['age']]
    # print (all_conti_x.shape)
    # print (all_conti_x)
    all_categ_x = all_xy[['gender'] + ['ethnicity'] + medi + comm]
    # print (all_categ_x.shape)
    # print (all_categ_x)

    # Using mean imputer after drop the nan data in medication, patient demographic data, avg_ureine, egfr_min and label
    imp = Imputer(strategy='mean', axis=0)
    all_conti_x_fitted = imp.fit_transform(all_conti_x)

    def normalize(all_conti_x_fitted, all_categ_x):
        # using the MinMaxScaler to normalization the all_x
        min_max_scaler = MinMaxScaler()
        all_conti_x_fitted = min_max_scaler.fit_transform(all_conti_x_fitted)
        # print (all_conti_x_fitted.shape, all_conti_x_fitted)
        # all_conti_x = DataFrame(all_conti_x_fitted, columns=all_conti_x.columns)
        # print (all_conti_x.shape)

        onehot_enc = OneHotEncoder(sparse=False)  # dense format
        all_categ_x_fitted = onehot_enc.fit_transform(all_categ_x)
        # print (all_categ_x_fitted.shape, all_categ_x_fitted)
        return all_conti_x_fitted, all_categ_x_fitted

    if isnormalized:
        all_conti_x_fitted, all_categ_x_fitted = normalize(all_conti_x_fitted, all_categ_x)

    x = np.hstack((all_conti_x_fitted, all_categ_x_fitted))
    # y = all_xy['label']
    # x = np.array(x)
    # y = np.array(y)
    # print (x.shape, y.shape)
    # return x, y
    y = all_xy['label']
    z_icustay_id = y.index
    x = np.array(x)
    y = np.array(y)
    z_icustay_id = np.array(z_icustay_id)

    print (x.shape, y.shape)
    return x, y, z_icustay_id, all_xy


time: 266 ms


In [22]:
def perf_model(pipe, param_grid, name, X_train, X_test,
               y_train, y_test, scoring, verbose=0):
    gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scoring, cv=5, n_jobs=-1, verbose=verbose)
    gs.fit(X_train, y_train)

    y_train_pred = gs.predict(X_train)
    y_test_pred = gs.predict(X_test)

    acc_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

    fpr, tpr, _ = roc_curve(y_train, gs.predict_proba(X_train)[:, 1])
    auc_train = auc(fpr, tpr)

    fpr, tpr, _ = roc_curve(y_test, gs.predict_proba(X_test)[:, 1])
    auc_test = auc(fpr, tpr)

    confmat_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
    confmat_test = confusion_matrix(y_true=y_test, y_pred=y_test_pred)

    print (' best parameter: ', gs.best_params_)
    print (' training acc:%.2f auc:%.2f ' % (acc_train, auc_train))
    print (' testing acc:%.2f auc:%.2f ' % (acc_test, auc_test))

    print (' train confusion matrix:\n', confmat_train)
    print (' testing confusion matrix:\n', confmat_test)
    print (' classification report:\n', classification_report(y_test, y_test_pred))

    train_report = np.array(precision_recall_fscore_support(y_train, y_train_pred))
    train_class1_report = train_report[:, 1]
    train_metrics = list(train_class1_report[:-1])
    train_metrics.extend([acc_train, auc_train])
    print ('training metrics: precision, recall, f1-score, acc, auc')
    print (train_metrics)

    test_report = np.array(precision_recall_fscore_support(y_test, y_test_pred))
    test_class1_report = test_report[:, 1]
    test_metrics = list(test_class1_report[:-1])
    test_metrics.extend([acc_test, auc_test])
    print ('test metrics: precision, recall, f1-score, acc, auc')
    print (test_metrics)

    return train_metrics, test_metrics
    """
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (recall)")

    plt.plot(fpr, tpr, label="acc:%f auc:%f" % (acc_test, auc_test))
    plt.legend(loc="best")
    plt.show()
    plt.close()

    precision, recall, _ = precision_recall_curve(y_train, gs.predict_proba(X_train)[:,1])
    average_precision = average_precision_score(y_test, gs.predict_proba(X_test)[:,1])
    plt.xlabel("precision")
    plt.ylabel("recall")
    plt.step(precision, recall, where='post', label='AP={0:0.2f}'.format(average_precision))
    plt.legend(loc="best")
    plt.show()
    plt.close()
    """


time: 92.4 ms


In [23]:
def try_dbdt(X_train, X_test, y_train, y_test, scoring):
    gbm = GradientBoostingClassifier(learning_rate=0.05, n_estimators=120, min_samples_leaf=60,
                                     max_features=9, subsample=0.7, random_state=10)

    param_grid = {'max_depth': list(range(3, 14, 2)), 'min_samples_split': list(range(100, 801, 200))}
    train_metrics, test_metrics = perf_model(gbm, param_grid, 'GBDT', X_train, X_test, y_train, y_test, scoring, 0)
    return train_metrics, test_metrics

time: 5.36 ms


In [24]:
#issue im having is that 

def try_models_cross(X_train, X_test, y_train, y_test, scoring):#  select data cross 5 Fold
    # X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, stratify=Y, random_state=RANDOM_STATE)
    # """
    # print ('\n\nLinear Logistic Regression with L1 Penalty')
    # lgr_l1_train_metrics, lgr_l1_test_metrics = try_lgr_l1(X_train, X_test, y_train, y_test, scoring)
    #
    # print ('\n\nLinear Logistic Regression with L2 Penalty')
    # lgr_l2_train_metrics, lgr_l2_test_metrics = try_lgr_l2(X_train, X_test, y_train, y_test, scoring)
    #
    # print ('\n\nStochastic Gradient Descent')
    # Elastic_train_metrics, Elastic_test_metrics = try_sgd(X_train, X_test, y_train, y_test, scoring)
    #
    # print ('\n\nRandom Forest')
    # rf_train_metrics, rf_test_metrics = try_rf(X_train, X_test, y_train, y_test, scoring)
    # #
    print ('\n\nGradient Boosting Decision tree')
    xgboost_train_metrics, xgboost_test_metrics = try_dbdt(X_train, X_test, y_train, y_test, scoring)




time: 6.8 ms


In [25]:
#y_train.reshape(-1,1)


time: 422 µs


In [26]:
# #y: one hot encoding my y labels

# from sklearn.preprocessing import OneHotEncoder
# enc = OneHotEncoder(handle_unknown='ignore')
# y=y_train.reshape(-1,1)
# enc.fit(y)
# y=enc.transform(y).toarray()

time: 1.46 ms


In [27]:
# import warnings
# from sklearn.exceptions import DataConversionWarning
# warnings.simplefilter(action='ignore', category=FutureWarning)
# warnings.filterwarnings(action='ignore', category=DataConversionWarning)
# warnings.filterwarnings(action='ignore', category=DeprecationWarning)

time: 2.65 ms


# formatting my data to fit his scheme

In [67]:
x_train= train_data.copy()
icustay_id=x_train.pop('icustay_id')
y_train= x_train.pop("final_bin").values

time: 4.19 ms


In [102]:
x_train= train_data.iloc[:,[1,2,3,4,5,6,7,8,9,38,39,40,41]] ###drastically reducing my dataframe size to test algorithm
#x_train= train_data.iloc[:,[1,2,3,4,5]] ###drastically reducing my dataframe size to test algorithm
x=np.array(x_train).copy()

y=y_train.copy()

z_icustay_id= icustay_id.copy()#icustay_id.index.to_series()#np.array(icustay_id)
all_xy=x_train.set_index(z_icustay_id) #in dataframe > csv format, idk if this will be an issue. NEEDS TO HAVE ICUSTAY_ID AS INDEX
all_xy['label']=y_train #has the outcome annotated as label

skf = StratifiedKFold(n_splits=5) #Stratified K-Folds cross-validator
time_interval=4

time: 10.5 ms


In [97]:
x
#len(y) #5432

array([[ 0.06909452,  0.07601444, -0.5       ,  0.07744773,  0.04757099],
       [ 0.05640602, -0.16415031, -0.20751875,  0.02122094,  0.26397869],
       [-0.06836176, -0.25320238, -0.5       ,  0.1226662 ,  0.12599099],
       ...,
       [ 0.35024169,  2.64174705,  0.16096405,  0.04745763,  0.03246678],
       [-0.40528818, -0.25320238, -0.5       ,  0.05675854,  0.01663438],
       [-0.35541805, -0.16415031, -1.        ,  0.05058858,  0.0724392 ]])

time: 2.67 ms


# trying his code on just one cv split

In [103]:
#running this prior to modeling so i can test only last split
train_index=0
test_index=0
for train_index, test_index in skf.split(x, y):
    train_index=train_index
    test_index=test_index
X_train_0, X_test_0 = x[train_index], x[test_index] #assigning x_train and x_test sets within this cv fold
y_train_0, y_test_0 = y[train_index], y[test_index] #assigning y_train and y_test sets within this cv fold

time: 6.73 ms


In [104]:
x[train_index]
x[test_index]

array([[ 0.04319947,  0.14849202,  0.16096405, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.2318563 ,  0.14849202, -1.        , ...,  1.        ,
         1.        ,  0.        ],
       [-0.04987013, -0.07991428,  0.16096405, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [ 0.35024169,  2.64174705,  0.16096405, ...,  0.        ,
         1.        ,  0.        ],
       [-0.40528818, -0.25320238, -0.5       , ...,  0.        ,
         1.        ,  0.        ],
       [-0.35541805, -0.16415031, -1.        , ...,  0.        ,
         1.        ,  0.        ]])

time: 2.88 ms


In [105]:
# using non-individual predictor for classification

xgboost_random = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                            min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4, scale_pos_weight=0.5, seed=27) #dropping scale weight greatly increases classification performance. 0.5 gives 79%accuracy, 77% accuracy for class=1.
xgboost_random.fit(x[train_index], y[train_index])
y_pred_random = xgboost_random.predict(x[test_index])
y_proba_random = xgboost_random.predict_proba(x[test_index])[:,1]

#y_test_random = y[test_index]

print ('this is the result of non-individual predictor using xgboost:')
print ('the Accuracy is:',accuracy_score(y[test_index], y_pred_random))
print ('the classification_report:\n', classification_report(y[test_index], y_pred_random))
print ('the AUC is:', roc_auc_score(y[test_index], y_proba_random))

logreg_random = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                            intercept_scaling=1, class_weight='balanced', random_state=None)
logreg_random.fit(X_train_0, y_train_0)
lr_y_pred_random = logreg_random.predict(X_test_0)
lr_y_pred_proba_random = logreg_random.predict_proba(X_test_0)[:, 1]
#y_test_random = y[test_index]


print ('this is the result of non-individual predictor using lr:')
print ('the Accuracy is:',accuracy_score(y_test_random, lr_y_pred_random))
print ('the classification_report: \n', classification_report(y_test_random, lr_y_pred_random))
print ('the AUC is:', roc_auc_score(y_test_random, lr_y_pred_proba_random))

this is the result of non-individual predictor using xgboost:
the Accuracy is: 1.0
the classification_report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the AUC is: 0.9999999999999999
this is the result of non-individual predictor using lr:
the Accuracy is: 1.0
the classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the AUC is: 1.0
time: 211 ms


In [85]:
sum(y_pred_random -y[test_index]) #this is an issue, why am i getting this model out of my training?

0

time: 2.34 ms


# ugh wtf, both of my models are predicting 100% accuracy.  

since i was getting good results, i wrapped it all in a function, maybe tinkered with .copy() and y_train, but all my code looks good above. 

i have played around with the xtrain, ytrrain, xtest, ytest

i've reduced the dimensions on xtrain... ><



    update: ok so i reduced xtrain down to only 5 variables and got 77% accuracy. this can't be right but it's good to know the classifier changed.

In [71]:
y[test_index]

array([1, 1, 1, ..., 0, 0, 0])

time: 2.27 ms


In [72]:
len(train_index)

4346

time: 1.82 ms


In [73]:
len(X_train_0) #4346
X_train_0

array([[ 2.00012000e+05,  6.90945218e-02,  7.60144407e-02, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 2.00014000e+05,  5.64060151e-02, -1.64150312e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 2.00033000e+05, -6.83617572e-02, -2.53202377e-01, ...,
         1.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       ...,
       [ 2.79891000e+05,  1.36269065e-01,  6.31709361e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 2.79924000e+05,  2.80649839e-01,  1.83632914e+00, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 2.80048000e+05, -6.83617572e-02,  2.84054612e-01, ...,
         1.00000000e+00,  1.00000000e+00,  0.00000000e+00]])

time: 2.87 ms


In [65]:
y_train_0

array([0, 0, 1, ..., 0, 0, 0])

time: 1.78 ms


In [67]:
# y_train_0
# y_test_0

array([0, 0, 1, ..., 0, 0, 0])

time: 2.25 ms


In [49]:
print('%%%%%')
num_fold = 0
# for train_index, test_index in skf.split(x, y):
#     #train_index: the index of training samples within this cv split
#     #test_index: the index of test samples within this cv split

#     X_train_0, X_test_0 = x[train_index], x[test_index] #assigning x_train and x_test sets within this cv fold
#     y_train_0, y_test_0 = y[train_index], y[test_index] #assigning y_train and y_test sets within this cv fold

print('#####################')

num_fold = num_fold + 1
print('this is the results of the {} fold in 5 folds:'.format(num_fold)) 

print('the number of testing samples in this fold:', test_index.size)

train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr


%%%%%
#####################
this is the results of the 1 fold in 5 folds:
the number of testing samples in this fold: 1086
time: 12.7 ms


In [34]:
def single_split_training(m=250):
    
    x_train= train_data.copy()
    icustay_id=x_train.pop('icustay_id')
    y_train= x_train.pop("final_bin").values
    
    x_train= train_data.iloc[:,[1,2,3,4,5,6,7,8,9,38,39,40,41]].copy() ###drastically reducing my dataframe size to test algorithm
    x=np.array(x_train)

    y=y_train.copy()
    
    z_icustay_id= icustay_id#icustay_id.index.to_series()#np.array(icustay_id)
    all_xy=x_train.set_index(icustay_id) #in dataframe > csv format, idk if this will be an issue. NEEDS TO HAVE ICUSTAY_ID AS INDEX
    all_xy['label']=y_train #has the outcome annotated as label

    skf = StratifiedKFold(n_splits=5) #Stratified K-Folds cross-validator
    time_interval=4
    
    
    #######
    
    for train_index, test_index in skf.split(x, y):
        X_train_0, X_test_0 = x[train_index], x[test_index] #assigning x_train and x_test sets within this cv fold
        y_train_0, y_test_0 = y[train_index], y[test_index] #assigning y_train and y_test sets within this cv fold
    
    #######
    num_fold = 0
    num_fold = num_fold + 1 ##silly to keep but it's from the loop
    print('this is the results of the {} fold in 5 folds:'.format(num_fold)) 

    print('the number of testing samples in this fold:', test_index.size)

    train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
    test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

    xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
    xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

    lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
    lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr

        ######

    
    indicator_time = 0 # the indicator
    for i, j in zip(test_z_icustay_id, test_index):  #looping through the zipped indicies of the test indicies/test icustay_id

        testing_sample_id = i #numerical index of first 1/2 of data ##??? this seems to be instead the    
        all_xy_0 = all_xy.loc[train_z_icustay_id] # select all TRAINING samples from  the current fold using icustay_id index
        all_xy_training = all_xy_0.append(all_xy.loc[i]) # append the current ith testing sample to the training set. 

        ###important parameter. was at 400, i changed to X
        m = m  # m is the number of similar cases or similar controls

        X_test_00 = x[j]
        y_test = y[j]

        X_test = X_test_00.reshape(1, -1)

        # print 'start selecting......'

        Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization

        ix = np.isin(z_icustay_id, Id_train_set)
        Id_train_set_index = list(np.where(ix))

        # Id_train_set_index = np.argwhere(z_icustay_id == Id_train_set)

        X_train = x[Id_train_set_index]
        y_train = y[Id_train_set_index]

        # print 'start training......'

        # scoring = 'roc_auc'

    # xgboost

        xgboost_mod = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                      min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                      objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
        xgboost_mod.fit(X_train, y_train)
        xg_y_pred = xgboost_mod.predict(X_test)
        xg_y_pred_proba = xgboost_mod.predict_proba(X_test)[:,1]

        xg_one_fold_pred.append(xg_y_pred)
        xg_one_fold_proba.append(xg_y_pred_proba)

    # lr 

        logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                                    intercept_scaling=1, class_weight='balanced', random_state=None)
        logreg.fit(X_train, y_train)
        lr_y_pred = logreg.predict(X_test)
        lr_y_pred_proba = logreg.predict_proba(X_test)[:,1]

        lr_one_fold_pred.append(lr_y_pred)
        lr_one_fold_proba.append(lr_y_pred_proba)

        indicator_time = indicator_time + 1
        # print 'the next testing sample and total samples:', indicator_time, test_index.size

    xg_y_individual_pred = np.array(xg_one_fold_pred)
    xg_y_individual_proba = np.array(xg_one_fold_proba)

    lr_y_individual_pred = np.array(lr_one_fold_pred)
    lr_y_individual_proba = np.array(lr_one_fold_proba)

    one_fold_y_test = y[test_index]

    print ('this is the result of individual predictor using xgboost:')
    print ('the acc of one fold:', accuracy_score(one_fold_y_test, xg_y_individual_pred))
    print ('the classification_report :', classification_report(one_fold_y_test, xg_y_individual_pred))
    print ('the auc of one fold:', roc_auc_score(one_fold_y_test, xg_y_individual_proba))

    print ('this is the result of individual predictor using lr:')
    print ('the acc of one fold:', accuracy_score(one_fold_y_test, lr_y_individual_pred))
    print ('the classification_report :', classification_report(one_fold_y_test, lr_y_individual_pred))
    print ('the auc of one fold:', roc_auc_score(one_fold_y_test, lr_y_individual_pred))


time: 328 ms


In [36]:
single_split_training(m=250)


this is the results of the 1 fold in 5 folds:
the number of testing samples in this fold: 1086
this is the result of individual predictor using xgboost:
the acc of one fold: 1.0
the classification_report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the auc of one fold: 1.0
this is the result of individual predictor using lr:
the acc of one fold: 1.0
the classification_report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the auc o

using m=150, took 9.1 min to run. same with 200...:
this is the result of individual predictor using xgboost:
the acc of one fold: 1.0
the classification_report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the auc of one fold: 1.0
this is the result of individual predictor using lr:
the acc of one fold: 1.0
the classification_report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the auc of one fold: 1.0

using m=100, took 10.5 min to run.:
this is the result of individual predictor using xgboost:
the acc of one fold: 1.0
the classification_report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the auc of one fold: 1.0
this is the result of individual predictor using lr:
the acc of one fold: 1.0
the classification_report :               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the auc of one fold: 1.0
time: 9min 2s

using m=400, took 10.5 min to run.:
this is the result of individual predictor using xgboost:
the acc of one fold: 0.567219152854512
the classification_report :               precision    recall  f1-score   support

           0       0.88      0.48      0.62       806
           1       0.35      0.81      0.49       280

   micro avg       0.57      0.57      0.57      1086
   macro avg       0.62      0.65      0.56      1086
weighted avg       0.74      0.57      0.59      1086

the auc of one fold: 0.7220932293512938
this is the result of individual predictor using lr:
the acc of one fold: 0.6233885819521179
the classification_report :               precision    recall  f1-score   support

           0       0.87      0.58      0.70       806
           1       0.38      0.75      0.51       280

   micro avg       0.62      0.62      0.62      1086
   macro avg       0.63      0.66      0.60      1086
weighted avg       0.74      0.62      0.65      1086

the auc of one fold: 0.6647022332506205

using m=250, took 9min to run.:
this is the result of individual predictor using xgboost:
the acc of one fold: 0.6390423572744015
the classification_report :               precision    recall  f1-score   support

           0       0.86      0.62      0.72       806
           1       0.39      0.71      0.50       280

   micro avg       0.64      0.64      0.64      1086
   macro avg       0.62      0.66      0.61      1086
weighted avg       0.74      0.64      0.66      1086

the auc of one fold: 0.7258995037220843
this is the result of individual predictor using lr:
the acc of one fold: 0.6712707182320442
the classification_report :               precision    recall  f1-score   support

           0       0.84      0.69      0.76       806
           1       0.41      0.62      0.49       280

   micro avg       0.67      0.67      0.67      1086
   macro avg       0.62      0.65      0.62      1086
weighted avg       0.73      0.67      0.69      1086

In [35]:
# using non-individual predictor for classification

xgboost_random = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                            min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27) #dropping scale weight greatly increases classification performance. 0.5 gives 79%accuracy, 77% accuracy for class=1.
xgboost_random.fit(X_train_0, y_train_0)
y_pred_random = xgboost_random.predict(X_test_0)
y_proba_random = xgboost_random.predict_proba(X_test_0)[:,1]

y_test_random = y[test_index]

print ('this is the result of non-individual predictor using xgboost:')
print ('the Accuracy is:',accuracy_score(y_test_random, y_pred_random))
print ('the classification_report:\n', classification_report(y_test_random, y_pred_random))
print ('the AUC is:', roc_auc_score(y_test_random, y_proba_random))

logreg_random = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                            intercept_scaling=1, class_weight='balanced', random_state=None)
logreg_random.fit(X_train_0, y_train_0)
lr_y_pred_random = logreg_random.predict(X_test_0)
lr_y_pred_proba_random = logreg_random.predict_proba(X_test_0)[:, 1]
#y_test_random = y[test_index]


print ('this is the result of non-individual predictor using lr:')
print ('the Accuracy is:',accuracy_score(y_test_random, lr_y_pred_random))
print ('the classification_report: \n', classification_report(y_test_random, lr_y_pred_random))
print ('the AUC is:', roc_auc_score(y_test_random, lr_y_pred_proba_random))

this is the result of non-individual predictor using xgboost:
the Accuracy is: 1.0
the classification_report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the AUC is: 1.0
this is the result of non-individual predictor using lr:
the Accuracy is: 1.0
the classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the AUC is: 1.0
time: 230 ms


this is the result of non-individual predictor using lr:
the Accuracy is: 1.0
the classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       806
           1       1.00      1.00      1.00       280

   micro avg       1.00      1.00      1.00      1086
   macro avg       1.00      1.00      1.00      1086
weighted avg       1.00      1.00      1.00      1086

the AUC is: 1.0
time: 34.3 ms


In [42]:
logreg_random.predict_proba(X_test_0)[:, 1]

array([9.99234101e-01, 9.99006320e-01, 9.99210966e-01, ...,
       6.67983969e-04, 6.04632349e-04, 5.07187275e-04])

time: 4.19 ms


In [40]:
logreg_random.predict(X_test_0)

array([1, 1, 1, ..., 0, 0, 0])

time: 2.42 ms


In [41]:
X_test_0

array([[ 0.04319947,  0.14849202,  0.16096405, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.2318563 ,  0.14849202, -1.        , ...,  1.        ,
         1.        ,  0.        ],
       [-0.04987013, -0.07991428,  0.16096405, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [ 0.35024169,  2.64174705,  0.16096405, ...,  0.        ,
         1.        ,  0.        ],
       [-0.40528818, -0.25320238, -0.5       , ...,  0.        ,
         1.        ,  0.        ],
       [-0.35541805, -0.16415031, -1.        , ...,  0.        ,
         1.        ,  0.        ]])

time: 2.45 ms


In [None]:
for time_interval in [4]:  #for his he used a bunch of different time_intervals. i may want to adopt this later on. ,48,72,96,120,144]:
    #x, y, z_icustay_id, all_xy = preprocessing(folder, time_interval)  # all_xy is for compute gower distance
    # x= [[1,3,4,5],[2,3,4,6],[1,3,5,8],[1,4,7,8]] ; x is numpy array, each item represents the value of feature
    # y = [1,0,1,1] ; y is label
    # z_icustay_id = [1234,345,678,991] ; is the id for each ICU stay
    # all_xy contains feature, label, and icustay_id, but, all_xy is csv format ##NEEDS TO HAVE ICUSTAY_ID AS INDEX

    
    ###formatting my data to fit his scheme
    x_train= x_train.iloc[:,[1,2,3,4,5,6,7,8,9,38,39,40,41]] ###drastically reducing my dataframe size to test algorithm
    x=np.array(x_train)
    #y: one hot encoding my y labels

#     from sklearn.preprocessing import OneHotEncoder
#     enc = OneHotEncoder(handle_unknown='ignore')
#     y=y_train.reshape(-1,1)
#     enc.fit(y)
#     y=enc.transform(y).toarray()

    y=y_train
    
    z_icustay_id= icustay_id#icustay_id.index.to_series()#np.array(icustay_id)
    all_xy=x_train.set_index(icustay_id) #in dataframe > csv format, idk if this will be an issue. NEEDS TO HAVE ICUSTAY_ID AS INDEX
    all_xy['label']=y_train #has the outcome annotated as label


    skf = StratifiedKFold(n_splits=5) #Stratified K-Folds cross-validator
    print('%%%%%')
    num_fold = 0
    for train_index, test_index in skf.split(x, y):
        #train_index: the index of training samples within this cv split
        #test_index: the index of test samples within this cv split
        
        print('***************')
        # print 'This is the '+ str(i)+' times result of '+str(n_fold)+' fold'
        X_train_0, X_test_0 = x[train_index], x[test_index] #assigning x_train and x_test sets within this cv fold
        y_train_0, y_test_0 = y[train_index], y[test_index] #assigning y_train and y_test sets within this cv fold

        print('#####################')

        num_fold = num_fold + 1
        print('this is the results of the {} fold in 5 folds:'.format(num_fold)) 

        print('the number of testing samples in this fold:', test_index.size)

        train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
        test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

        xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
        xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

        lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
        lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr

        indicator_time = 0 # the indicator
        for i, j in zip(test_z_icustay_id, test_index):  #looping through the zipped indicies of the test indicies/test icustay_id

            testing_sample_id = i #numerical index of first 1/2 of data ##??? this seems to be instead the    
            all_xy_0 = all_xy.loc[train_z_icustay_id] # select all TRAINING samples from  the current fold using icustay_id index
            all_xy_training = all_xy_0.append(all_xy.loc[i]) # append the current ith testing sample to the training set. 
                       
            ###important parameter. was at 400, i changed to X
            m = 250  # m is the number of similar cases or similar controls

            X_test_00 = x[j]
            y_test = y[j]

            X_test = X_test_00.reshape(1, -1)

            # print 'start selecting......'

            Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization

            ix = np.isin(z_icustay_id, Id_train_set)
            Id_train_set_index = list(np.where(ix))

            # Id_train_set_index = np.argwhere(z_icustay_id == Id_train_set)

            X_train = x[Id_train_set_index]
            y_train = y[Id_train_set_index]

            # print 'start training......'

            # scoring = 'roc_auc'

# xgboost

            xgboost_mod = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                          min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
            xgboost_mod.fit(X_train, y_train)
            xg_y_pred = xgboost_mod.predict(X_test)
            xg_y_pred_proba = xgboost_mod.predict_proba(X_test)[:,1]

            xg_one_fold_pred.append(xg_y_pred)
            xg_one_fold_proba.append(xg_y_pred_proba)

# lr 

            logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                                        intercept_scaling=1, class_weight='balanced', random_state=None)
            logreg.fit(X_train, y_train)
            lr_y_pred = logreg.predict(X_test)
            lr_y_pred_proba = logreg.predict_proba(X_test)[:,1]

            lr_one_fold_pred.append(lr_y_pred)
            lr_one_fold_proba.append(lr_y_pred_proba)

            indicator_time = indicator_time + 1
            # print 'the next testing sample and total samples:', indicator_time, test_index.size

        xg_y_individual_pred = np.array(xg_one_fold_pred)
        xg_y_individual_proba = np.array(xg_one_fold_proba)

        lr_y_individual_pred = np.array(lr_one_fold_pred)
        lr_y_individual_proba = np.array(lr_one_fold_proba)

        one_fold_y_test = y[test_index]

        print ('this is the result of individual predictor using xgboost:')
        print ('the accuracy of one fold:', accuracy_score(one_fold_y_test, xg_y_individual_pred))
        print ('the classification_report: \n', classification_report(one_fold_y_test, xg_y_individual_pred))
        print ('the AUC of one fold:', roc_auc_score(one_fold_y_test, xg_y_individual_proba))

        print ('this is the result of individual predictor using lr:')
        print ('the accuracy of one fold:', accuracy_score(one_fold_y_test, lr_y_individual_pred))
        print ('the classification_report: \n', classification_report(one_fold_y_test, lr_y_individual_pred))
        print ('the AUC of one fold:', roc_auc_score(one_fold_y_test, lr_y_individual_pred))

# using non-individual predictor for classification

        xgboost_random = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                                    min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
        xgboost_random.fit(X_train_0, y_train_0)
        y_pred_random = xgboost_random.predict(X_test_0)
        y_proba_random = xgboost_random.predict_proba(X_test_0)[:,1]

        y_test_random = y[test_index]

        print ('this is the result of non-individual predictor using xgboost:')
        print ('the accuracy is:',accuracy_score(y_test_random, y_pred_random))
        print ('the classification_report: \n', classification_report(y_test_random, y_pred_random))
        print ('the AUC is:', roc_auc_score(y_test_random, y_proba_random))

        logreg_random = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                                    intercept_scaling=1, class_weight='balanced', random_state=None)
        logreg_random.fit(X_train_0, y_train_0)
        lr_y_pred_random = logreg_random.predict(X_test_0)
        lr_y_pred_proba_random = logreg_random.predict_proba(X_test_0)[:, 1]

        print ('this is the result of non-individual predictor using lr:')
        print ('the accuracy is:',accuracy_score(y_test_random, lr_y_pred_random))
        print ('the classification_report: \n', classification_report(y_test_random, lr_y_pred_random))
        print ('the AUC is:', roc_auc_score(y_test_random, lr_y_pred_proba_random))

%%%%%
***************
#####################
this is the results of the 1 fold in 5 folds:
the number of testing samples in this fold: 1087
this is the result of individual predictor using xgboost:
the acc of one fold: 0.641214351425943
the classification_report :               precision    recall  f1-score   support

           0       0.84      0.64      0.73       807
           1       0.38      0.64      0.48       280

   micro avg       0.64      0.64      0.64      1087
   macro avg       0.61      0.64      0.60      1087
weighted avg       0.72      0.64      0.66      1087

the auc of one fold: 0.6857983713931669
this is the result of individual predictor using lr:
the acc of one fold: 0.6669733210671573
the classification_report :               precision    recall  f1-score   support

           0       0.82      0.71      0.76       807
           1       0.39      0.55      0.46       280

   micro avg       0.67      0.67      0.67      1087
   macro avg       0.61      0

In [42]:
x

array([[ 0.07601444, -0.5       ,  0.07744773, ...,  0.        ,
         1.        ,  0.        ],
       [-0.16415031, -0.20751875,  0.02122094, ...,  0.        ,
         1.        ,  0.        ],
       [-0.25320238, -0.5       ,  0.1226662 , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [ 2.64174705,  0.16096405,  0.04745763, ...,  0.        ,
         1.        ,  0.        ],
       [-0.25320238, -0.5       ,  0.05675854, ...,  0.        ,
         1.        ,  0.        ],
       [-0.16415031, -1.        ,  0.05058858, ...,  0.        ,
         1.        ,  0.        ]])

time: 2.54 ms


In [69]:
y_train_0

array([0, 0, 1, ..., 0, 0, 0])

time: 2.07 ms


In [70]:
y_test_0

array([1, 1, 1, ..., 0, 0, 0])

time: 3.66 ms


In [None]:


# for i, j in zip(test_z_icustay_id, test_index):  #looping through the zipped indicies of training/test set for this cv fold. 
#     # i_index = np.where(test_z_icustay_id == i)
#     # tem_test_z_icustay_id = np.delete(test_z_icustay_id, i_index)
#     testing_sample_id = i #numerical index of first 1/2 of data

#     all_xy_0 = all_xy.loc[train_z_icustay_id] # select training samples from  5 fold
#     all_xy_training = all_xy_0.append(all_xy.loc[i]) # note that , containing the i

In [None]:
# i=3
# j=4304
# testing_sample_id = i #numerical index of first 1/2 of data

# all_xy_0 = all_xy.loc[train_z_icustay_id] # select training samples from  5 fold
# all_xy_training = all_xy_0.append(all_xy.loc[i]) # note that , containing the i

In [None]:
# Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)
# #testing_sample_id
# #all_xy_training
# #m= # m is the number of similar cases or similar controls
# #time_interval

In [None]:
# all_xy_0 = all_xy.loc[train_z_icustay_id] # select training samples from  5 fold
# all_xy_training = all_xy_0.append(all_xy.loc[i]) # note that , containing the i

In [184]:
# test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

time: 1.97 ms


# testing training  loop without looping

In [190]:
# #test_z_icustay_id = 0#z_icustay_id[test_index] # the icustay_id of samples in TESTING set from 5 fold

# train_index_tester=0
# test_index_tester=0
# for train_index, test_index in skf.split(x, y_train):
#     #print(train_index, test_index )
#     train_index_tester=train_index
#     test_index_tester=test_index
#     #test_z_icustay_id=  z_icustay_id[test_index] # the icustay_id of samples in TESTING set from 5 fold

    
#     #print(train_index) #the index of training samples within this cv split
#     #print(test_index)#the index of test samples within this cv split
#     #print(test_z_icustay_id)

time: 6.92 ms


In [191]:
# #works
# X_train_0, X_test_0 = x[train_index_tester], x[test_index_tester] #assigning x_train and x_test sets within this cv fold
# y_train_0, y_test_0 = y[train_index_tester], y[test_index_tester] #assigning y_train and y_test sets within this cv fold

# print('#####################')

# num_fold = num_fold + 1
# print('this is the results of the {} fold in 5 folds:'.format(num_fold)) 

# print('the number of testing samples in this fold:', test_index.size)

# train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
# test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

# xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
# xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

# lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
# lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr

#####################
this is the results of the 2 fold in 5 folds:
the number of testing samples in this fold: 1086
time: 12.7 ms


In [194]:
# for i, j in zip(test_z_icustay_id, test_index_tester):  #looping through the zipped indicies of the test indicies/test icustay_id
#     print(i) #icustay_id
#     print(j) #index of icustay_id

278924
4298
278932
4299
279048
4303
279057
4304
279084
4305
279089
4306
279153
4310
279172
4311
279261
4317
279334
4322
279396
4323
279413
4325
279443
4331
279496
4335
279804
4354
279925
4360
280054
4362
280112
4363
280124
4364
280162
4365
280168
4366
280185
4367
280191
4368
280193
4369
280199
4370
280210
4371
280230
4372
280231
4373
280236
4374
280237
4375
280245
4376
280264
4377
280317
4378
280321
4379
280322
4380
280353
4381
280377
4382
280389
4383
280390
4384
280391
4385
280411
4386
280433
4387
280435
4388
280445
4389
280492
4390
280540
4391
280545
4392
280552
4393
280569
4394
280576
4395
280595
4396
280602
4397
280605
4398
280608
4399
280630
4400
280635
4401
280673
4402
280682
4403
280701
4404
280715
4405
280773
4406
280783
4407
280788
4408
280829
4409
280834
4410
280844
4411
280888
4412
280892
4413
280899
4414
280900
4415
280928
4416
280943
4417
280948
4418
280951
4419
280957
4420
280963
4421
280967
4422
280982
4423
281000
4424
281085
4425
281092
4426
281098
4427
281099
4428
2811

In [198]:
# all_xy.loc[test_index_tester]

Unnamed: 0,amax_bun,amax_creatinine,amax_daily_sofa,amax_heartrate,amax_meanartpress,amax_platelet,amax_ptt,amax_sysbp,amax_temperature,amin_bun,...,any_vasoactive_False,any_vasoactive_True,leukocyte_False,leukocyte_True,"pao2fio2Ratio_(0, 200]","pao2fio2Ratio_(200, 333]","pao2fio2Ratio_(333, 475]","pao2fio2Ratio_(475, 3000]",vent_recieved_False,vent_recieved_True
4298,0.043199,0.148492,0.160964,0.031318,0.032467,-0.018116,-0.026686,0.002437,-0.006575,-0.049870,...,1,0,1,0,0,0,0,1,1,0
4299,0.231856,0.148492,-1.000000,0.104827,0.059648,0.102000,-0.030164,0.061535,0.009804,0.136269,...,1,0,1,0,0,0,0,1,0,1
4303,-0.049870,-0.079914,0.160964,-0.018847,0.104986,0.037520,0.274991,0.032390,0.059787,-0.108854,...,1,0,0,1,1,0,0,0,0,1
4304,0.093070,0.000000,-0.207519,0.037873,0.108071,-0.007729,0.004263,0.122124,0.028005,0.069095,...,1,0,1,0,0,0,0,1,1,0
4305,0.267443,0.826712,0.660964,0.102193,0.079212,0.089090,0.018799,0.051457,0.043986,0.155882,...,0,1,0,1,0,1,0,0,0,1
4306,0.317313,0.347655,0.660964,0.117672,0.082546,0.037520,0.082133,0.075082,0.007768,0.253675,...,0,1,1,0,0,0,0,1,0,1
4310,-0.240028,-0.253202,0.160964,0.096860,0.038845,0.150447,-0.031331,0.030192,0.017923,-0.274114,...,1,0,0,1,0,0,0,1,1,0
4311,0.436722,0.284055,0.953445,0.157795,0.098730,-0.280688,0.102890,0.073184,0.082385,0.360513,...,0,1,0,1,1,0,0,0,0,1
4317,0.056406,0.148492,0.500000,0.074574,0.033752,-0.173527,-0.016447,0.011986,0.079290,-0.049870,...,1,0,1,0,1,0,0,0,1,0
4322,0.394093,1.836329,-1.000000,0.024623,0.169487,-0.025345,0.448032,0.089829,-0.000410,0.174374,...,1,0,1,0,0,0,0,1,1,0


time: 53.6 ms


In [203]:
# #duplicate
# # for i, j in zip(test_z_icustay_id, test_index):  #looping through the zipped indicies of the test indicies/test icustay_id

# testing_sample_id = i #numerical index of first 1/2 of data ##??? this seems to be instead the 
# all_xy_0 = all_xy.loc[train_z_icustay_id] # select all TRAINING samples from  the current fold

# print("I is {}".format(i))
# print("testing_sample_id is {}".format(testing_sample_id))
# print("all_xy_0 is {}".format(all_xy_0))

# ###unclear what's going on. it seems like we are individually taking the test icustay id index
# all_xy_training = all_xy_0.append(all_xy.loc[i]) # append the testing 

# print("all_xy_training is {}".format(all_xy_training))


I is 200012
testing_sample_id is 200012
all_xy_0 is             amax_bun  amax_creatinine  amax_daily_sofa  amax_heartrate  \
icustay_id                                                               
219346      0.299501         0.826712         0.660964        0.094159   
219360     -0.240028        -0.253202         0.160964        0.068747   
219373     -0.155149         0.000000        -1.000000        0.096860   
219384     -0.131174        -0.347655         0.292481        0.074574   
219385      0.403080         2.038366         0.160964        0.166536   
219391      0.260632         0.076014        -0.500000        0.037873   
219411      0.104422         0.148492         0.000000        0.053689   
219412     -0.087975         0.076014         0.403677        0.157795   
219441     -0.181044        -0.347655         0.000000        0.125134   
219481      0.174374        -0.079914        -0.500000        0.102193   
219551     -0.108854        -0.079914         0.160964      

In [204]:
# ###important parameter. was at 400, i changed to X
# m = 250  # m is the number of similar cases or similar controls

# X_test_00 = x[j]
# y_test = y[j]

# X_test = X_test_00.reshape(1, -1)

# # print 'start selecting......'

# ##good now have all these parameters for testing this 
# Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization  #

KeyError: "['label'] not found in axis"

time: 22.5 ms


In [None]:
    ###important parameter. was at 400, i changed to X
    m = 250  # m is the number of similar cases or similar controls

    X_test_00 = x[j]
    y_test = y[j]

    X_test = X_test_00.reshape(1, -1)

    # print 'start selecting......'

    Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization

In [180]:
for i, j in zip(train_index_tester, test_index_tester):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [161]:
?skf.split(x, y_train)

time: 26.6 ms


[0;31mSignature:[0m [0mskf[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0mgroups[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Generate indices to split data into training and test set.

Parameters
----------
X : array-like, shape (n_samples, n_features)
    Training data, where n_samples is the number of samples
    and n_features is the number of features.

    Note that providing ``y`` is sufficient to generate the splits and
    hence ``np.zeros(n_samples)`` may be used as a placeholder for
    ``X`` instead of actual training data.

y : array-like, shape (n_samples,)
    The target variable for supervised learning problems.
    Stratification is done based on the y labels.

groups : object
    Always ignored, exists for compatibility.

Yields
------
train : ndarray
    The training set indices for that split.

test : ndarray
    The testing set indices for that split.

Notes
-----
Randomized CV splitters

In [None]:
#his unannotated code

In [58]:
# # if __name__ == '__main__': #basically execute only if run as a script. i will unravel this so i can run it inline here

#ge:reading in a file when running as script

# path = './logs/individualization_24_1th.txt'
# f = open(path, 'a+')
# orig_stdout = sys.stdout
# sys.stdout = f



for time_interval in [24]:  # ,48,72,96,120,144]:
    x, y, z_icustay_id, all_xy = preprocessing(folder, time_interval)  # all_xy is for compute gower distance

    skf = StratifiedKFold(n_splits=5)
    print '%%%%%'
    num_fold = 0
    for train_index, test_index in skf.split(x, y):
        print '***************'
        # print 'This is the '+ str(i)+' times result of '+str(n_fold)+' fold'
        X_train_0, X_test_0 = x[train_index], x[test_index]
        y_train_0, y_test_0 = y[train_index], y[test_index]

        print '#####################'

        num_fold = num_fold + 1
        print 'this is the results of the %d fold in 5 folds:' %num_fold

        print 'the number of testing samples in this fold:', test_index.size

        train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
        test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

        xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
        xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

        lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
        lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr

        indicator_time = 0 # the indicator
        for i, j in zip(test_z_icustay_id, test_index):
            # i_index = np.where(test_z_icustay_id == i)
            # tem_test_z_icustay_id = np.delete(test_z_icustay_id, i_index)
            testing_sample_id = i

            all_xy_0 = all_xy.loc[train_z_icustay_id] # select training samples from  5 fold
            all_xy_training = all_xy_0.append(all_xy.loc[i]) # note that , containing the i

            m = 400  # m is the number of similar cases or similar controls

            X_test_00 = x[j]
            y_test = y[j]

            X_test = X_test_00.reshape(1, -1)

            # print 'start selecting......'

            Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization

            ix = np.isin(z_icustay_id, Id_train_set)
            Id_train_set_index = list(np.where(ix))

            # Id_train_set_index = np.argwhere(z_icustay_id == Id_train_set)

            X_train = x[Id_train_set_index]
            y_train = y[Id_train_set_index]

            # print 'start training......'

            # scoring = 'roc_auc'

# xgboost

            xgboost_mod = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                          min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
            xgboost_mod.fit(X_train, y_train)
            xg_y_pred = xgboost_mod.predict(X_test)
            xg_y_pred_proba = xgboost_mod.predict_proba(X_test)[:,1]

            xg_one_fold_pred.append(xg_y_pred)
            xg_one_fold_proba.append(xg_y_pred_proba)

# lr 

            logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                                        intercept_scaling=1, class_weight='balanced', random_state=None)
            logreg.fit(X_train, y_train)
            lr_y_pred = logreg.predict(X_test)
            lr_y_pred_proba = logreg.predict_proba(X_test)[:,1]

            lr_one_fold_pred.append(lr_y_pred)
            lr_one_fold_proba.append(lr_y_pred_proba)

            indicator_time = indicator_time + 1
            # print 'the next testing sample and total samples:', indicator_time, test_index.size

        xg_y_individual_pred = np.array(xg_one_fold_pred)
        xg_y_individual_proba = np.array(xg_one_fold_proba)

        lr_y_individual_pred = np.array(lr_one_fold_pred)
        lr_y_individual_proba = np.array(lr_one_fold_proba)

        one_fold_y_test = y[test_index]

        print 'this is the result of individual predictor using xgboost:'
        print 'the acc of one fold:', accuracy_score(one_fold_y_test, xg_y_individual_pred)
        print 'the classification_report :', classification_report(one_fold_y_test, xg_y_individual_pred)
        print 'the auc of one fold:', roc_auc_score(one_fold_y_test, xg_y_individual_proba)

        print 'this is the result of individual predictor using lr:'
        print 'the acc of one fold:', accuracy_score(one_fold_y_test, lr_y_individual_pred)
        print 'the classification_report :', classification_report(one_fold_y_test, lr_y_individual_pred)
        print 'the auc of one fold:', roc_auc_score(one_fold_y_test, lr_y_individual_pred)

# using non-individual predictor for classification

        xgboost_random = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                                    min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
        xgboost_random.fit(X_train_0, y_train_0)
        y_pred_random = xgboost_random.predict(X_test_0)
        y_proba_random = xgboost_random.predict_proba(X_test_0)[:,1]

        y_test_random = y[test_index]

        print 'this is the result of non-individual predictor using xgboost:'
        print 'the acc is:',accuracy_score(y_test_random, y_pred_random)
        print 'the classification_report:', classification_report(y_test_random, y_pred_random)
        print 'the auc is:', roc_auc_score(y_test_random, y_proba_random)

        logreg_random = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                                    intercept_scaling=1, class_weight='balanced', random_state=None)
        logreg_random.fit(X_train_0, y_train_0)
        lr_y_pred_random = logreg_random.predict(X_test_0)
        lr_y_pred_proba_random = logreg_random.predict_proba(X_test_0)[:, 1]

        print 'this is the result of non-individual predictor using lr:'
        print 'the acc is:',accuracy_score(y_test_random, lr_y_pred_random)
        print 'the classification_report:', classification_report(y_test_random, lr_y_pred_random)
        print 'the auc is:', roc_auc_score(y_test_random, lr_y_pred_proba_random)

        # break

SyntaxError: Missing parentheses in call to 'print'. Did you mean print('%%%%%')? (<ipython-input-58-348e33b4eb37>, line 16)

In [None]:

# if __name__ == '__main__': #basically execute only if run as a script. i will undo this
#     path = './logs/individualization_24_1th.txt'
#     f = open(path, 'a+')
#     orig_stdout = sys.stdout
#     sys.stdout = f
#     for time_interval in [24]:  # ,48,72,96,120,144]:
#         x, y, z_icustay_id, all_xy = preprocessing(folder, time_interval)  # all_xy is for compute gower distance

#         skf = StratifiedKFold(n_splits=5)
#         print '%%%%%'
#         num_fold = 0
#         for train_index, test_index in skf.split(x, y):
#             print '***************'
#             # print 'This is the '+ str(i)+' times result of '+str(n_fold)+' fold'
#             X_train_0, X_test_0 = x[train_index], x[test_index]
#             y_train_0, y_test_0 = y[train_index], y[test_index]

#             print '#####################'

#             num_fold = num_fold + 1
#             print 'this is the results of the %d fold in 5 folds:' %num_fold

#             print 'the number of testing samples in this fold:', test_index.size

#             train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
#             test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

#             xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
#             xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

#             lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
#             lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr

#             indicator_time = 0 # the indicator
#             for i, j in zip(test_z_icustay_id, test_index):
#                 # i_index = np.where(test_z_icustay_id == i)
#                 # tem_test_z_icustay_id = np.delete(test_z_icustay_id, i_index)
#                 testing_sample_id = i

#                 all_xy_0 = all_xy.loc[train_z_icustay_id] # select training samples from  5 fold
#                 all_xy_training = all_xy_0.append(all_xy.loc[i]) # note that , containing the i

#                 m = 400  # m is the number of similar cases or similar controls

#                 X_test_00 = x[j]
#                 y_test = y[j]

#                 X_test = X_test_00.reshape(1, -1)

#                 # print 'start selecting......'

#                 Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization

#                 ix = np.isin(z_icustay_id, Id_train_set)
#                 Id_train_set_index = list(np.where(ix))

#                 # Id_train_set_index = np.argwhere(z_icustay_id == Id_train_set)

#                 X_train = x[Id_train_set_index]
#                 y_train = y[Id_train_set_index]

#                 # print 'start training......'

#                 # scoring = 'roc_auc'

# # xgboost

#                 xgboost_mod = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
#                               min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
#                               objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
#                 xgboost_mod.fit(X_train, y_train)
#                 xg_y_pred = xgboost_mod.predict(X_test)
#                 xg_y_pred_proba = xgboost_mod.predict_proba(X_test)[:,1]

#                 xg_one_fold_pred.append(xg_y_pred)
#                 xg_one_fold_proba.append(xg_y_pred_proba)

# # lr 

#                 logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
#                                             intercept_scaling=1, class_weight='balanced', random_state=None)
#                 logreg.fit(X_train, y_train)
#                 lr_y_pred = logreg.predict(X_test)
#                 lr_y_pred_proba = logreg.predict_proba(X_test)[:,1]

#                 lr_one_fold_pred.append(lr_y_pred)
#                 lr_one_fold_proba.append(lr_y_pred_proba)

#                 indicator_time = indicator_time + 1
#                 # print 'the next testing sample and total samples:', indicator_time, test_index.size

#             xg_y_individual_pred = np.array(xg_one_fold_pred)
#             xg_y_individual_proba = np.array(xg_one_fold_proba)

#             lr_y_individual_pred = np.array(lr_one_fold_pred)
#             lr_y_individual_proba = np.array(lr_one_fold_proba)

#             one_fold_y_test = y[test_index]

#             print 'this is the result of individual predictor using xgboost:'
#             print 'the acc of one fold:', accuracy_score(one_fold_y_test, xg_y_individual_pred)
#             print 'the classification_report :', classification_report(one_fold_y_test, xg_y_individual_pred)
#             print 'the auc of one fold:', roc_auc_score(one_fold_y_test, xg_y_individual_proba)

#             print 'this is the result of individual predictor using lr:'
#             print 'the acc of one fold:', accuracy_score(one_fold_y_test, lr_y_individual_pred)
#             print 'the classification_report :', classification_report(one_fold_y_test, lr_y_individual_pred)
#             print 'the auc of one fold:', roc_auc_score(one_fold_y_test, lr_y_individual_pred)

# # using non-individual predictor for classification

#             xgboost_random = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
#                                         min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
#                                         objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
#             xgboost_random.fit(X_train_0, y_train_0)
#             y_pred_random = xgboost_random.predict(X_test_0)
#             y_proba_random = xgboost_random.predict_proba(X_test_0)[:,1]

#             y_test_random = y[test_index]

#             print 'this is the result of non-individual predictor using xgboost:'
#             print 'the acc is:',accuracy_score(y_test_random, y_pred_random)
#             print 'the classification_report:', classification_report(y_test_random, y_pred_random)
#             print 'the auc is:', roc_auc_score(y_test_random, y_proba_random)

#             logreg_random = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
#                                         intercept_scaling=1, class_weight='balanced', random_state=None)
#             logreg_random.fit(X_train_0, y_train_0)
#             lr_y_pred_random = logreg_random.predict(X_test_0)
#             lr_y_pred_proba_random = logreg_random.predict_proba(X_test_0)[:, 1]

#             print 'this is the result of non-individual predictor using lr:'
#             print 'the acc is:',accuracy_score(y_test_random, lr_y_pred_random)
#             print 'the classification_report:', classification_report(y_test_random, lr_y_pred_random)
#             print 'the auc is:', roc_auc_score(y_test_random, lr_y_pred_proba_random)

#             # break
#     sys.stdout = orig_stdout
#     f.close()

## need to have a robust evaluation of model performance


## need to try cross validation