# description

sklearn modeling of the median imputed training data. note the preprocessing of data from 07.20-worst_case_model was performed in R (09.newagg2_preprocessing_med_impute.rmd). this eventually will be converted over to python, but for now works in r. 

preprocessing includes variable formatting (categorical to factor variables in r, train/test split, and median imputation).

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import numpy as np
import glob
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.externals.joblib import Memory
from sklearn.metrics import classification_report
memory = Memory(cachedir='/tmp', verbose=0)
#@memory.cache above any def fxn.

%matplotlib inline
plt.style.use('ggplot')

from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('livereveal', {
        'width': 1024,
        'height': 768,
        'scroll': True,
})

%load_ext autotime

You provided "cachedir='/tmp'", use "location='/tmp'" instead.
  del sys.path[0]


In [2]:
os.chdir('/Users/geickelb1/Documents/GitHub/mimiciii-antibiotics-modeling') #use to change working directory
wd= os.getcwd() #'/Users/geickelb1/Documents/GitHub/mimiciii-antibiotics-modeling'

date="04042019"
final_pt_df2 = pd.read_csv(Path(wd + '/data/raw/csv/04042019_final_pt_df2_v.csv') , index_col=0) #only for patients with minimum vitals
patients= list(final_pt_df2['subject_id'].unique())
hadm_id= list(final_pt_df2['hadm_id'].unique())
icustay_id= list(final_pt_df2['icustay_id'].unique())
icustay_id= [int(x) for x in icustay_id]

time: 129 ms


In [3]:
train_data= pd.read_csv("/Users/geickelb1/Documents/GitHub/mimiciii-antibiotics-modeling/models/imputation/04042019_newagg2_median_imputed_train.csv") #two class training data

time: 91.9 ms


# light data reformatting for model

### most data are already converted to median type zscores, however weight and admit age still need to be converted.

In [4]:
weight_median=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","weight"]+1).median()
weight_quant1=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","weight"]+1).quantile(0.25)#.between(train_data['col'].quantile(.25), df['col'].quantile(.75), inclusive=True)]
weight_quant3=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","weight"]+1).quantile(0.75)
weight_iqr=weight_quant3-weight_quant1; weight_iqr
print(weight_median,weight_quant3,weight_quant1, weight_iqr)

4.356708826689592 4.499809670330265 4.200204952921578 0.29960471740868666
time: 13 ms


In [5]:
age_median=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","first_admit_age"]+1).median()
age_quant1=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","first_admit_age"]+1).quantile(0.25)
age_quant3=np.log(train_data.loc[train_data['final_bin']=="C_neg/A_partial","first_admit_age"]+1).quantile(0.75)
age_iqr=age_quant3-age_quant1;
print(age_median,age_quant3,age_quant1, age_iqr)

4.194943760778217 4.367991089683742 3.9691119690666907 0.39887912061705144
time: 37.4 ms


In [6]:
#converting to log scaled standardized data for age/weight
train_data['weight']=train_data['weight'].apply(lambda x: (np.log(x+1)-weight_median)/weight_iqr)
train_data['first_admit_age']=train_data['first_admit_age'].apply(lambda x: (np.log(x+1)-age_median)/age_iqr)

time: 21.4 ms


### onehot encoding categorical var

In [7]:
cols_to_transform=['any_vasoactive', 'leukocyte', 'pao2fio2Ratio', 'vent_recieved']
train_data = pd.get_dummies(train_data, columns = cols_to_transform )
train_data

Unnamed: 0,icustay_id,amax_bun,amax_creatinine,amax_daily_sofa,amax_heartrate,amax_meanartpress,amax_platelet,amax_ptt,amax_sysbp,amax_temperature,...,any_vasoactive_False,any_vasoactive_True,leukocyte_False,leukocyte_True,"pao2fio2Ratio_(0, 200]","pao2fio2Ratio_(200, 333]","pao2fio2Ratio_(333, 475]","pao2fio2Ratio_(475, 3000]",vent_recieved_False,vent_recieved_True
0,200012,0.069095,0.076014,-0.500000,0.077448,0.047571,-0.076639,0.012640,-0.022901,0.021964,...,1,0,1,0,0,0,0,1,1,0
1,200014,0.056406,-0.164150,-0.207519,0.021221,0.263979,-0.067398,-0.030164,0.118889,0.003685,...,1,0,1,0,0,1,0,0,0,1
2,200033,-0.068362,-0.253202,-0.500000,0.122666,0.125991,-0.061462,-0.034854,0.084386,0.061749,...,0,1,1,0,0,0,0,1,0,1
3,200036,0.136269,0.000000,-0.500000,0.132424,0.153843,-0.034114,0.105303,0.078839,0.015897,...,1,0,1,0,0,0,0,1,1,0
4,200059,0.287056,0.347655,0.403677,0.127583,0.196365,0.085552,0.521840,0.139305,0.018327,...,0,1,1,0,0,1,0,0,0,1
5,200063,0.311484,0.780201,0.403677,0.085914,0.137417,0.004289,-0.088158,0.080700,0.103102,...,1,0,1,0,0,0,0,1,0,1
6,200078,0.015050,0.148492,0.160964,0.120179,0.036308,0.137576,0.015733,0.002437,0.040008,...,0,1,1,0,0,0,0,1,1,0
7,200091,0.125998,0.000000,0.292481,0.034613,0.145733,-0.189289,0.057380,0.078839,0.034019,...,1,0,1,0,0,0,1,0,0,1
8,200099,0.146224,0.408760,0.160964,0.071674,0.101873,-0.052867,-0.055494,0.071275,0.025994,...,1,0,1,0,0,0,0,1,1,0
9,200108,0.104422,-0.164150,-0.207519,0.148804,0.156503,0.112900,0.028836,0.123729,0.028005,...,1,0,1,0,0,0,0,1,0,1


time: 104 ms


In [8]:
list(train_data)

['icustay_id',
 'amax_bun',
 'amax_creatinine',
 'amax_daily_sofa',
 'amax_heartrate',
 'amax_meanartpress',
 'amax_platelet',
 'amax_ptt',
 'amax_sysbp',
 'amax_temperature',
 'amin_bun',
 'amin_creatinine',
 'amin_daily_sofa',
 'amin_heartrate',
 'amin_meanartpress',
 'amin_platelet',
 'amin_ptt',
 'amin_sysbp',
 'amin_temperature',
 'median_bun',
 'median_creatinine',
 'median_daily_sofa',
 'median_heartrate',
 'median_meanartpress',
 'median_platelet',
 'median_ptt',
 'median_sysbp',
 'median_temperature',
 'std_bun',
 'std_creatinine',
 'std_daily_sofa',
 'std_heartrate',
 'std_meanartpress',
 'std_platelet',
 'std_ptt',
 'std_sysbp',
 'std_temperature',
 'first_admit_age',
 'weight',
 'final_bin',
 'any_vasoactive_False',
 'any_vasoactive_True',
 'leukocyte_False',
 'leukocyte_True',
 'pao2fio2Ratio_(0, 200]',
 'pao2fio2Ratio_(200, 333]',
 'pao2fio2Ratio_(333, 475]',
 'pao2fio2Ratio_(475, 3000]',
 'vent_recieved_False',
 'vent_recieved_True']

time: 1.99 ms


# binarizing outcome for training data

In [9]:
#binarizing and poping outcome for training data
train_data.loc[train_data['final_bin']=="C_pos/A_full","final_bin"]=1
train_data.loc[train_data['final_bin']=="C_neg/A_partial","final_bin"]=0
train_data['final_bin']=pd.to_numeric(train_data['final_bin'])


time: 10.8 ms


# building a sklearn pipeline
As the name suggests, pipeline class allows sticking multiple processes into a single scikit-learn estimator. pipeline class has fit, predict and score method just like any other estimator (ex. LinearRegression).

To implement pipeline, as usual we separate features and labels from the data-set at first.

In [10]:
x_train= train_data.copy()
icustay_id=x_train.pop('icustay_id')
y_train= x_train.pop("final_bin").values

time: 3.31 ms


if we needed our data to be scaled we would apply that here, but i've already done that.

In [11]:
# if we needed our data to be scaled we would apply that here, but i've already done that.
# from sklearn.preprocessing import StandardScaler

time: 704 µs


Now we are ready to create a pipeline object by providing with the list of steps. 

Here our steps are standard scalar and support vector machine. 

These steps are list of tuples consisting of name and an instance of the transformer or estimator.

In [12]:
# # steps = [('scaler', StandardScaler()), ('SVM', SVC())] #so step 1 is known as scaler, which performs StandardScaler() function on the input. 
# from sklearn.svm import SVC
# steps = [('SVM', SVC())] #removed step 1 since i already scaled my data
# from sklearn.pipeline import Pipeline
# pipeline = Pipeline(steps) # define the pipeline object.

time: 66.8 ms


In [15]:
# steps = [('scaler', StandardScaler()), ('SVM', SVC())] #so step 1 is known as scaler, which performs StandardScaler() function on the input. 
from sklearn.svm import SVC
steps = [('SVM', SVC(gamma="scale"))] #removed step 1 since i already scaled my data. added gamma=scale
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps) # define the pipeline object.

time: 3.46 ms


The strings (‘scaler’, ‘SVM’) can be anything, as these are just names to identify clearly the transform or estimator. We can use make_pipeline instead of Pipeline to avoid naming the estimator or transformer. The final step has to be an estimator in this list of tuples.

if we needed to do train/test split (which i've already done), we could use:

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2, random_state=30, stratify=Y) #It’s necessary to use stratify as I’ve mentioned before that the labels are imbalanced as most of the wine quality falls in the range 5,6.

#### hypertuning:
SVM is usually optimized using two parameters gamma,C . I will discuss in an upcoming post on how they exactly work, but here let’s define a parameter grid that we will use in GridSearchCV .

In [22]:
parameteres = {'SVM__kernel':('linear', 'rbf'), 'SVM__C':[0.1, 1, 10]} #i think i need to include the SVM__  because i'm passing a pipeline object in.

time: 808 µs


Now we instantiate the GridSearchCV object with pipeline and the parameter space with 5 folds cross validation.


In [23]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipeline, param_grid=parameteres, cv=5) #pipeline here is basically just adding the model. 

time: 1.3 ms


We can use this to fit on the training data-set and test the algorithm on the training set. Also we can find the best fit parameters for the SVM as below. 
## NOTE: i need to figure out how to extract cv misclass/ other loss parameter.

In [33]:
grid.fit(x_train, y_train)
#print("score = %3.2f") %(grid.score(x_test,y_test))\
print(grid.score(x_train,y_ty_trainest))
print(grid.best_params_)

NameError: name 'rid' is not defined

time: 33.5 s


In [31]:
# print("score = %s ") %(grid.score(x_train,y_train))
# print(grid.best_params_)

score = %s 


TypeError: unsupported operand type(s) for %: 'NoneType' and 'float'

time: 503 ms


In [37]:
print(grid.score(x_train,y_train))
print(grid.best_params_)

0.7816642120765832
{'SVM__C': 10, 'SVM__kernel': 'linear'}
time: 475 ms


In [39]:
grid.cv_results_



{'mean_fit_time': array([0.73107057, 0.79110894, 1.71803112, 0.84989548]),
 'mean_score_time': array([0.07952285, 0.13074217, 0.07977705, 0.11922569]),
 'mean_test_score': array([0.7757732 , 0.76859352, 0.78019146, 0.77632548]),
 'mean_train_score': array([0.77733802, 0.77264354, 0.78221657, 0.79768044]),
 'param_SVM__C': masked_array(data=[1, 1, 10, 10],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_SVM__kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'SVM__C': 1, 'SVM__kernel': 'linear'},
  {'SVM__C': 1, 'SVM__kernel': 'rbf'},
  {'SVM__C': 10, 'SVM__kernel': 'linear'},
  {'SVM__C': 10, 'SVM__kernel': 'rbf'}],
 'rank_test_score': array([3, 4, 1, 2], dtype=int32),
 'split0_test_score': array([0.76724931, 0.76172953, 0.7700092 , 0.77092916]),
 'split0_train_score': array([0.78066743, 0.7735328 , 0.

time: 8.51 ms


# local methods (trying functions written by postdoc)
Compute_Gower_Distance.py:

In [45]:
import os, sys
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.utils import validation
from sklearn.metrics import pairwise
from scipy.sparse import issparse

time: 2.41 ms


converting to floats

In [86]:
def _return_float_dtype(X, Y):
    ##used in grower distance, converts values to floats for formatting.
    """
    1. If dtype of X and Y is float32, then dtype float32 is returned.
    2. Else dtype float is returned.
    """
    if not issparse(X) and not isinstance(X, np.ndarray):
        X = np.asarray(X)

    if Y is None:
        Y_dtype = X.dtype
    elif not issparse(Y) and not isinstance(Y, np.ndarray):
        Y = np.asarray(Y)
        Y_dtype = Y.dtype
    else:
        Y_dtype = Y.dtype

    if X.dtype == Y_dtype == np.float32:
        dtype = np.float32
    elif X.dtype == np.object and not issparse(X):
        dtype = np.float
        for col in range(X.shape[1]):
            if not np.issubdtype(type(X[0, col]), np.number):
                dtype = np.object
                break
    else:
        dtype = np.float
    return X, Y, dtype


time: 20.7 ms


In [57]:
# x_train_float, y_train_float, dtype =_return_float_dtype(X=x_train, Y=y_train)

time: 2.53 ms


In [87]:
def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
    ##used in grower distance, checks x and y dimensions against each otehr.
    X, Y, dtype_float = _return_float_dtype(X, Y)

    warn_on_dtype = dtype is not None
    estimator = 'check_pairwise_arrays'
    if dtype is None:
        dtype = dtype_float
    
    ##Input validation on an array, list, sparse matrix or similar.
    ##By default, the input is checked to be a non-empty 2D array containing only finite values.
    
    if Y is X or Y is None:
        X = Y = validation.check_array(X, accept_sparse='csr', dtype=dtype,
                            warn_on_dtype=warn_on_dtype, estimator=estimator)
    else:
        X = validation.check_array(X, accept_sparse='csr', dtype=dtype,
                        warn_on_dtype=warn_on_dtype, estimator=estimator)
        Y = validation.check_array(Y, accept_sparse='csr', dtype=dtype,
                        warn_on_dtype=warn_on_dtype, estimator=estimator)

    if precomputed:
        if X.shape[1] != Y.shape[0]:
            raise ValueError("Precomputed metric requires shape "
                             "(n_queries, n_indexed). Got (%d, %d) "
                             "for %d indexed." %
                             (X.shape[0], X.shape[1], Y.shape[0]))
    elif X.shape[1] != Y.shape[1]:
        raise ValueError("Incompatible dimension for X and Y matrices: "
                         "X.shape[1] == %d while Y.shape[1] == %d" % (
                             X.shape[1], Y.shape[1]))

    return X, Y

time: 26.5 ms


In [69]:
#check_pairwise_arrays(X=x_train_float, Y=y_train_float, precomputed=True)

ValueError: Expected 2D array, got 1D array instead:
array=[0. 0. 1. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

time: 10.7 ms


In [103]:
def gower_distances(X, Y=None, w=None, categorical_features=None):
    """
    Computes the gower distances between X and Y

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    Y : array-like, shape (n_samples, n_features)

    w:  array-like, shape (n_features)
    According the Gower formula, w is an attribute weight.

    categorical_features: array-like, shape (n_features)
    Indicates with True/False wheter a column is a categorical attribute.
    This is useful when categorical atributes are represented as integer
    values.

    Returns
    -------
    similarities : ndarray, shape (n_samples, )

    Notes
    ------
    Gower is a similarity measure for categorical, boolean and numerical mixed
    data.

    """

    X, Y = check_pairwise_arrays(X, Y, dtype=(np.object, None)[issparse(X) or
                                                               issparse(Y)])
    rows, cols = X.shape

    if categorical_features is None:
        categorical_features = []
        for col in range(cols):
            if np.issubdtype(type(X[0, col]), np.number):
                categorical_features.append(False)
            else:
                categorical_features.append(True)
    # Calculates the normalized ranges and max values of numeric values
    ranges_of_numeric = [0.0] * cols
    max_of_numeric = [0.0] * cols
    for col in range(cols):
        if not categorical_features[col]:
            max = None
            min = None
            if issparse(X):
                col_array = X.getcol(col)
                max = col_array.max() + 0.0
                min = col_array.min() + 0.0
            else:
                col_array = X[:, col].astype(np.double)
                max = np.nanmax(col_array)
                min = np.nanmin(col_array)

            if np.isnan(max):
                max = 0.0
            if np.isnan(min):
                min = 0.0
            max_of_numeric[col] = max
            ranges_of_numeric[col] = (1 - min / max) if (max != 0) else 0.0

    if w is None:
        w = [1] * cols

    yrows, ycols = Y.shape

    dm = np.zeros((rows, yrows), dtype=np.double)

    for i in range(0, rows):
        j_start = i

        # for non square results
        if rows != yrows:
            j_start = 0

        for j in range(j_start, yrows):
            sum_sij = 0.0
            sum_wij = 0.0
            for col in range(cols):
                value_xi = X[i, col]
                value_xj = Y[j, col]

                if not categorical_features[col]:
                    if (max_of_numeric[col] != 0):
                        value_xi = value_xi / max_of_numeric[col]
                        value_xj = value_xj / max_of_numeric[col]
                    else:
                        value_xi = 0
                        value_xj = 0

                    if ranges_of_numeric[col] != 0:
                        sij = abs(value_xi - value_xj) / ranges_of_numeric[col]
                    else:
                        sij = 0
                    wij = (w[col], 0)[np.isnan(value_xi) or np.isnan(value_xj)]
                else:
                    sij = (1.0, 0.0)[value_xi == value_xj]
                    wij = (w[col], 0)[value_xi is None and value_xj is None]
                sum_sij += (wij * sij)
                sum_wij += wij

            if sum_wij != 0:
                dm[i, j] = (sum_sij / sum_wij)
                if j < rows and i < yrows:
                    dm[j, i] = dm[i, j]
    return dm


time: 201 ms


In [111]:
##testing grower distance
x_train1=x_train.iloc[:100,1:20]
x_train2=x_train.iloc[101:201,1:20]
print(len(x_train1), #2715
len(x_train2)) #2715

100 100
time: 2.75 ms


In [108]:
x_train2

Unnamed: 0,amax_creatinine,amax_daily_sofa,amax_heartrate,amax_meanartpress,amax_platelet,amax_ptt,amax_sysbp,amax_temperature,amin_bun,amin_creatinine,amin_daily_sofa,amin_heartrate,amin_meanartpress,amin_platelet,amin_ptt,amin_sysbp,amin_temperature,median_bun,median_creatinine
2717,-0.079914,-0.500000,0.021221,-0.031305,0.168830,0.243009,-0.033688,0.045971,-0.049870,-0.164150,-0.500000,-0.102960,-0.110517,0.127943,-0.099264,-0.101429,-0.021066,-0.007892,-0.122032
2718,1.118385,0.160964,0.047458,0.069000,0.030172,-0.037222,0.063509,0.019945,-0.155149,0.631709,0.160964,-0.018847,-0.036032,0.003225,-0.050531,-0.017654,-0.021066,0.043199,1.040469
2719,-0.448205,0.160964,0.071674,0.101873,0.034795,0.012640,0.071275,0.025994,-0.405288,-0.448205,-0.500000,-0.064773,-0.104599,0.034795,-0.040804,-0.081338,-0.031509,-0.405288,-0.448205
2720,-0.347655,-0.500000,-0.003680,0.079212,0.104617,-0.099264,0.120511,0.021964,-0.131174,-0.448205,-1.000000,-0.203799,-0.076508,0.055641,-0.125573,-0.042049,-0.039920,-0.078168,-0.347655
2721,-0.347655,0.160964,0.071674,0.062011,0.150966,0.075231,0.067418,0.025592,-0.312219,-0.555695,-0.500000,-0.078556,-0.155231,0.138129,0.038602,-0.119300,-0.023148,-0.098414,-0.501950
2722,0.578891,0.584963,0.107438,0.098730,-0.075070,0.384208,0.084386,0.029210,0.069095,-0.079914,-1.000000,-0.129418,-0.050632,-0.108998,0.317555,-0.033688,-0.063315,0.093070,0.076014
2723,0.217747,-0.207519,-0.055909,0.092352,0.066104,-0.024385,0.041019,-0.002462,0.093070,0.076014,-0.207519,-0.123945,-0.141823,0.056465,-0.024385,-0.108447,-0.012767,0.165260,0.076014
2724,-0.079914,0.160964,0.056759,0.108071,0.027347,0.072609,0.038885,0.019945,-0.405288,-0.253202,0.000000,-0.129418,-0.176429,-0.007729,-0.009778,-0.146319,-0.050506,-0.355418,-0.164150
2725,1.762710,0.953445,0.234153,0.043860,-0.076639,0.481033,0.075082,0.102340,0.344981,0.915764,0.903677,0.024623,-0.135318,-0.183899,-0.018697,-0.176103,0.017923,0.360513,1.395769
2726,-0.164150,0.160964,0.191390,0.117153,0.085552,-0.063069,0.057545,0.038014,-0.155149,-0.347655,-0.500000,-0.047287,-0.344214,-0.021698,-0.063069,-0.101429,-0.050506,-0.073479,-0.300429


time: 42.1 ms


In [112]:
gower_distances(X=x_train1, Y=x_train2, w=None, categorical_features=None) #works



array([[0.24318248, 0.14126017, 0.11850909, ..., 0.16277388, 0.25359059,
        0.21829271],
       [0.14126017, 0.18324272, 0.15246607, ..., 0.16600347, 0.25083128,
        0.17877063],
       [0.11850909, 0.15246607, 0.17748913, ..., 0.19899093, 0.28713812,
        0.16931492],
       ...,
       [0.16277388, 0.16600347, 0.19899093, ..., 0.20421216, 0.24213328,
        0.17592173],
       [0.25359059, 0.25083128, 0.28713812, ..., 0.24213328, 0.28678125,
        0.16907204],
       [0.21829271, 0.17877063, 0.16931492, ..., 0.17592173, 0.16907204,
        0.15356587]])

time: 610 ms


In [None]:
# X2 = np.array([['Syria', 1200, 0, 411114.44, True],
#                ['Ireland', 300, 0, 199393333.22, False],
#                ['United Kingdom', 100, 0, 32323222.121, None]], dtype=object)
#
# Y2 = np.array([['United Kingdom', 200, 0, 99923921.47, True]], dtype=object)
#
# flag = [True,True,False,False,True]
#
# D = gower_distances(X2, Y2,categorical_features = flag)
#
# print D

# compute the gower distance for an example from AKI dataset

# folder = '/Users/xuzhenxing/Documents/mimic_AKI_data/real_time_prediction/features/all/dropped/x'
#
# time_interval = 24 # 24,48, ...., Note that, the length of 24h  is different from other hours  in terms of columns
#
# all_x = pd.read_csv(os.path.join(folder, 'all_{}hours.csv'.format(time_interval)), index_col=0)

# all_x = all_x.fillna(np.nan)
#
# for i in all_x.index:
# # i = 211552
#     A_x = all_x.loc[i]
#     print i
#
#     break
#
# candidate_set = all_x.values[:, :]
# testing_sample_0 = A_x.as_matrix()
# testing_sample = testing_sample_0.reshape(1,-1)

# if time_interval ==24:
#     flag_cate_fea = [True,False]  # 24,48, ...., Note that, the length of 24h  is different from other hours  in terms of columns
# else:

# D1 = gower_distances(candidate_set, testing_sample,categorical_features = flag_cate_fea)

# folder = '/Users/xuzhenxing/Documents/mimic_AKI_data/real_time_prediction/features/all/dropped/x'

In [44]:
def select_train_samples(sample_id, all_xy, m, time_interval):# m is number of similar cases or controls
    num_control = m   # the ratio of case and control is 1:2, 1:3,1:4
    if time_interval == 24:
        top_con_variables = [False]*128
        mid_cat_variables = [True]*5
        age_variable = [False]
        next_cat_variables = [True]*10
        last_con_variables = [False]*2

        flag_cate_fea = top_con_variables + mid_cat_variables + age_variable + next_cat_variables + last_con_variables # 24,48, ...., Note that, the length of 24h  is different from other hours  in terms of columns
    else:
        top_con_variables = [False]*129  #there is another item in other hours
        mid_cat_variables = [True]*5
        age_variable = [False]
        next_cat_variables = [True]*10
        last_con_variables = [False]*2

        flag_cate_fea = top_con_variables + mid_cat_variables + age_variable + next_cat_variables + last_con_variables # 24,48, ...., Note that, the length of 24h  is different from other hours  in terms of columns
        
    all_xy = all_xy.fillna(np.nan) # fill empty with nan

    x_candidate_label = all_xy.loc[sample_id] # get the object sample
    x_candidate = x_candidate_label.drop('label')
    x_candidate_tem = x_candidate.as_matrix()
    testing_sample = x_candidate_tem.reshape(1, -1)  # covert into ....

    all_x_candidate_tem = all_xy.drop([sample_id], axis=0, inplace=False) # delete the object sample from whole set

# select similar cases
    all_cases = all_x_candidate_tem[all_x_candidate_tem.label == 1]
    all_cases_candidate = all_cases.drop(['label'], axis=1, inplace=False)
    gower_candidate_case = all_cases_candidate.values[:, :] # convert into ndarray

    Gower_Distance_1 = gower_distances(gower_candidate_case, testing_sample, categorical_features = flag_cate_fea) # Gower_Distance_1 is ndarray
    Gower_Distance_2 = list(Gower_Distance_1)
    Gower_Distance_3 = pd.Series(Gower_Distance_2, index = all_cases_candidate.index)
    Gower_Distance_4 = Gower_Distance_3.sort_values(ascending=False)

    Id_selected_cases = Gower_Distance_4.index[:m].tolist() # the id set of the top m similar samples

# select similar controls
    all_controls = all_x_candidate_tem[all_x_candidate_tem.label == 0]
    all_controls_candidate = all_controls.drop(['label'], axis=1, inplace=False)
    gower_candidate_control = all_controls_candidate.values[:, :] # convert into ndarray

    Gower_Distance_11 = gower_distances(gower_candidate_control, testing_sample,categorical_features = flag_cate_fea) # Gower_Distance_1 is ndarray
    Gower_Distance_22 = list(Gower_Distance_11)
    Gower_Distance_33 = pd.Series(Gower_Distance_22, index = all_controls_candidate.index)
    Gower_Distance_44 = Gower_Distance_33.sort_values(ascending=False)

    Id_selected_controls = Gower_Distance_44.index[:num_control].tolist() # the id set of the top m similar samples

    train_set_id = Id_selected_controls+Id_selected_cases

    train_set_id = np.array(train_set_id)
    return train_set_id

time: 164 ms


individualization_predictor.py:

In [123]:
import os, sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, Imputer
from sklearn.model_selection import StratifiedKFold
#from Compute_gower_distance import select_train_samples
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, accuracy_score, auc, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier #conda install -c conda-forge xgboost to install


RANDOM_STATE = 15485867

time: 7.82 ms


In [None]:

folder = '/Users/xuzhenxing/Documents/mimic_AKI_data/real_time_prediction/features/all/dropped/xy'
# folder = './xy'


def preprocessing(folder, time_interval, isnormalized=True):
    """Data preprocessing, Preprocessing  missing data with mean imputation; Normalize continous feature with MinMaxScaler;
    Normalize categorical feature with OneHotEncoder.

    Args:
        folder: dir path of source data;
        time_interval: interval of time, can be 24,48,72,96,120,144.
    Returns:
        x: features
        y: lables

    """

    all_xy = pd.read_csv(os.path.join(folder, 'all_{}hours_test_individualization_1thousand.csv'.format(time_interval)), index_col=0)
    # print (all_xy.shape)
    # print (all_xy.columns)

    medi = ['diuretics', 'nsaid', 'radio', 'angiotensin']
    pat = ['gender', 'age', 'ethnicity']
    # Total 9 comorbidity
    comm = ['congestive_heart_failure', 'peripheral_vascular', 'hypertension',
            'diabetes', 'liver_disease', 'mi', 'cad', 'cirrhosis', 'jaundice']

    # Total 8 chartevents
    chart = ['DiasBP_min', 'DiasBP_max', 'DiasBP_first', 'DiasBP_last', 'DiasBP_slope', 'DiasBP_avg',
             'Glucose_min', 'Glucose_max', 'Glucose_first', 'Glucose_last', 'Glucose_slope', 'Glucose_avg',
             'HeartRate_min', 'HeartRate_max', 'HeartRate_first', 'HeartRate_last', 'HeartRate_slope', 'HeartRate_avg',
             'MeanBP_min', 'MeanBP_max', 'MeanBP_first', 'MeanBP_last', 'MeanBP_slope', 'MeanBP_avg',
             'RespRate_min', 'RespRate_max', 'RespRate_first', 'RespRate_last', 'RespRate_slope', 'RespRate_avg',
             'SpO2_min', 'SpO2_max', 'SpO2_first', 'SpO2_last', 'SpO2_slope', 'SpO2_avg',
             'SysBP_min', 'SysBP_max', 'SysBP_first', 'SysBP_last', 'SysBP_slope', 'SysBP_avg',
             'Temp_min', 'Temp_max', 'Temp_first', 'Temp_last', 'Temp_slope', 'Temp_avg']

    # Total 12 labvents
    lab = ['BICARBONATE_first', 'BICARBONATE_last', 'BICARBONATE_min', 'BICARBONATE_max', 'BICARBONATE_avg',
           'BICARBONATE_slope', 'BICARBONATE_count',
           'BUN_first', 'BUN_last', 'BUN_min', 'BUN_max', 'BUN_avg', 'BUN_slope', 'BUN_count',
           'CHLORIDE_first', 'CHLORIDE_last', 'CHLORIDE_min', 'CHLORIDE_max', 'CHLORIDE_avg', 'CHLORIDE_slope',
           'CHLORIDE_count',
           'CREATININE_first', 'CREATININE_last', 'CREATININE_min', 'CREATININE_max', 'CREATININE_avg',
           'CREATININE_slope', 'CREATININE_count',
           'HEMOGLOBIN_first', 'HEMOGLOBIN_last', 'HEMOGLOBIN_min', 'HEMOGLOBIN_max', 'HEMOGLOBIN_avg',
           'HEMOGLOBIN_slope', 'HEMOGLOBIN_count',
           'INR_first', 'INR_last', 'INR_min', 'INR_max', 'INR_avg', 'INR_count',
           'PLATELET_first', 'PLATELET_last', 'PLATELET_min', 'PLATELET_max', 'PLATELET_avg', 'PLATELET_slope',
           'PLATELET_count',
           'POTASSIUM_first', 'POTASSIUM_last', 'POTASSIUM_min', 'POTASSIUM_max', 'POTASSIUM_avg', 'POTASSIUM_slope',
           'POTASSIUM_count',
           'PT_first', 'PT_last', 'PT_min', 'PT_max', 'PT_avg', 'PT_count',
           'PTT_first', 'PTT_last', 'PTT_min', 'PTT_max', 'PTT_avg', 'PTT_count',
           'WBC_first', 'WBC_last', 'WBC_min', 'WBC_max', 'WBC_avg', 'WBC_slope', 'WBC_count',
           'CALCIUM_first', 'CALCIUM_last', 'CALCIUM_min', 'CALCIUM_max', 'CALCIUM_avg', 'CALCIUM_count'
           ]

    if time_interval != 24:  # The 24h data lack of the feature 'CALCIUM_slope'
        lab.append('CALCIUM_slope')
    subset = medi + pat + comm + ['avg_urine'] + ['egfr_min'] + ['label'] # note that ['avg_urine'] + ['egfr_min'] is important, ignoring if they are empty.

    all_xy = all_xy.dropna(subset=subset)

    # print ('after dropping nan in the catergorical variables, the shape is {}'.format(all_xy.shape))

    all_conti_x = all_xy[chart + lab + ['avg_urine'] + ['egfr_min'] + ['age']]
    # print (all_conti_x.shape)
    # print (all_conti_x)
    all_categ_x = all_xy[['gender'] + ['ethnicity'] + medi + comm]
    # print (all_categ_x.shape)
    # print (all_categ_x)

    # Using mean imputer after drop the nan data in medication, patient demographic data, avg_ureine, egfr_min and label
    imp = Imputer(strategy='mean', axis=0)
    all_conti_x_fitted = imp.fit_transform(all_conti_x)

    def normalize(all_conti_x_fitted, all_categ_x):
        # using the MinMaxScaler to normalization the all_x
        min_max_scaler = MinMaxScaler()
        all_conti_x_fitted = min_max_scaler.fit_transform(all_conti_x_fitted)
        # print (all_conti_x_fitted.shape, all_conti_x_fitted)
        # all_conti_x = DataFrame(all_conti_x_fitted, columns=all_conti_x.columns)
        # print (all_conti_x.shape)

        onehot_enc = OneHotEncoder(sparse=False)  # dense format
        all_categ_x_fitted = onehot_enc.fit_transform(all_categ_x)
        # print (all_categ_x_fitted.shape, all_categ_x_fitted)
        return all_conti_x_fitted, all_categ_x_fitted

    if isnormalized:
        all_conti_x_fitted, all_categ_x_fitted = normalize(all_conti_x_fitted, all_categ_x)

    x = np.hstack((all_conti_x_fitted, all_categ_x_fitted))
    # y = all_xy['label']
    # x = np.array(x)
    # y = np.array(y)
    # print (x.shape, y.shape)
    # return x, y
    y = all_xy['label']
    z_icustay_id = y.index
    x = np.array(x)
    y = np.array(y)
    z_icustay_id = np.array(z_icustay_id)

    print (x.shape, y.shape)
    return x, y, z_icustay_id, all_xy


In [None]:
def perf_model(pipe, param_grid, name, X_train, X_test,
               y_train, y_test, scoring, verbose=0):
    gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scoring, cv=5, n_jobs=-1, verbose=verbose)
    gs.fit(X_train, y_train)

    y_train_pred = gs.predict(X_train)
    y_test_pred = gs.predict(X_test)

    acc_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

    fpr, tpr, _ = roc_curve(y_train, gs.predict_proba(X_train)[:, 1])
    auc_train = auc(fpr, tpr)

    fpr, tpr, _ = roc_curve(y_test, gs.predict_proba(X_test)[:, 1])
    auc_test = auc(fpr, tpr)

    confmat_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
    confmat_test = confusion_matrix(y_true=y_test, y_pred=y_test_pred)

    print (' best parameter: ', gs.best_params_)
    print (' training acc:%.2f auc:%.2f ' % (acc_train, auc_train))
    print (' testing acc:%.2f auc:%.2f ' % (acc_test, auc_test))

    print (' train confusion matrix:\n', confmat_train)
    print (' testing confusion matrix:\n', confmat_test)
    print (' classification report:\n', classification_report(y_test, y_test_pred))

    train_report = np.array(precision_recall_fscore_support(y_train, y_train_pred))
    train_class1_report = train_report[:, 1]
    train_metrics = list(train_class1_report[:-1])
    train_metrics.extend([acc_train, auc_train])
    print ('training metrics: precision, recall, f1-score, acc, auc')
    print (train_metrics)

    test_report = np.array(precision_recall_fscore_support(y_test, y_test_pred))
    test_class1_report = test_report[:, 1]
    test_metrics = list(test_class1_report[:-1])
    test_metrics.extend([acc_test, auc_test])
    print ('test metrics: precision, recall, f1-score, acc, auc')
    print (test_metrics)

    return train_metrics, test_metrics
    """
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (recall)")

    plt.plot(fpr, tpr, label="acc:%f auc:%f" % (acc_test, auc_test))
    plt.legend(loc="best")
    plt.show()
    plt.close()

    precision, recall, _ = precision_recall_curve(y_train, gs.predict_proba(X_train)[:,1])
    average_precision = average_precision_score(y_test, gs.predict_proba(X_test)[:,1])
    plt.xlabel("precision")
    plt.ylabel("recall")
    plt.step(precision, recall, where='post', label='AP={0:0.2f}'.format(average_precision))
    plt.legend(loc="best")
    plt.show()
    plt.close()
    """


In [None]:
def try_dbdt(X_train, X_test, y_train, y_test, scoring):
    gbm = GradientBoostingClassifier(learning_rate=0.05, n_estimators=120, min_samples_leaf=60,
                                     max_features=9, subsample=0.7, random_state=10)

    param_grid = {'max_depth': list(range(3, 14, 2)), 'min_samples_split': list(range(100, 801, 200))}
    train_metrics, test_metrics = perf_model(gbm, param_grid, 'GBDT', X_train, X_test, y_train, y_test, scoring, 0)
    return train_metrics, test_metrics

In [None]:
#issue im having is that 

def try_models_cross(X_train, X_test, y_train, y_test, scoring):#  select data cross 5 Fold
    # X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, stratify=Y, random_state=RANDOM_STATE)
    # """
    # print ('\n\nLinear Logistic Regression with L1 Penalty')
    # lgr_l1_train_metrics, lgr_l1_test_metrics = try_lgr_l1(X_train, X_test, y_train, y_test, scoring)
    #
    # print ('\n\nLinear Logistic Regression with L2 Penalty')
    # lgr_l2_train_metrics, lgr_l2_test_metrics = try_lgr_l2(X_train, X_test, y_train, y_test, scoring)
    #
    # print ('\n\nStochastic Gradient Descent')
    # Elastic_train_metrics, Elastic_test_metrics = try_sgd(X_train, X_test, y_train, y_test, scoring)
    #
    # print ('\n\nRandom Forest')
    # rf_train_metrics, rf_test_metrics = try_rf(X_train, X_test, y_train, y_test, scoring)
    # #
    print ('\n\nGradient Boosting Decision tree')
    xgboost_train_metrics, xgboost_test_metrics = try_dbdt(X_train, X_test, y_train, y_test, scoring)




In [116]:
skf = StratifiedKFold(n_splits=5)

NameError: name 'StratifiedKFold' is not defined

time: 10.8 ms


In [None]:
# # if __name__ == '__main__': #basically execute only if run as a script. i will unravel this so i can run it inline here

#ge:reading in a file when running as script

# path = './logs/individualization_24_1th.txt'
# f = open(path, 'a+')
# orig_stdout = sys.stdout
# sys.stdout = f



for time_interval in [24]:  # ,48,72,96,120,144]:
    x, y, z_icustay_id, all_xy = preprocessing(folder, time_interval)  # all_xy is for compute gower distance

    skf = StratifiedKFold(n_splits=5)
    print '%%%%%'
    num_fold = 0
    for train_index, test_index in skf.split(x, y):
        print '***************'
        # print 'This is the '+ str(i)+' times result of '+str(n_fold)+' fold'
        X_train_0, X_test_0 = x[train_index], x[test_index]
        y_train_0, y_test_0 = y[train_index], y[test_index]

        print '#####################'

        num_fold = num_fold + 1
        print 'this is the results of the %d fold in 5 folds:' %num_fold

        print 'the number of testing samples in this fold:', test_index.size

        train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
        test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

        xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
        xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

        lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
        lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr

        indicator_time = 0 # the indicator
        for i, j in zip(test_z_icustay_id, test_index):
            # i_index = np.where(test_z_icustay_id == i)
            # tem_test_z_icustay_id = np.delete(test_z_icustay_id, i_index)
            testing_sample_id = i

            all_xy_0 = all_xy.loc[train_z_icustay_id] # select training samples from  5 fold
            all_xy_training = all_xy_0.append(all_xy.loc[i]) # note that , containing the i

            m = 400  # m is the number of similar cases or similar controls

            X_test_00 = x[j]
            y_test = y[j]

            X_test = X_test_00.reshape(1, -1)

            # print 'start selecting......'

            Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization

            ix = np.isin(z_icustay_id, Id_train_set)
            Id_train_set_index = list(np.where(ix))

            # Id_train_set_index = np.argwhere(z_icustay_id == Id_train_set)

            X_train = x[Id_train_set_index]
            y_train = y[Id_train_set_index]

            # print 'start training......'

            # scoring = 'roc_auc'

# xgboost

            xgboost_mod = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                          min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
            xgboost_mod.fit(X_train, y_train)
            xg_y_pred = xgboost_mod.predict(X_test)
            xg_y_pred_proba = xgboost_mod.predict_proba(X_test)[:,1]

            xg_one_fold_pred.append(xg_y_pred)
            xg_one_fold_proba.append(xg_y_pred_proba)

# lr 

            logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                                        intercept_scaling=1, class_weight='balanced', random_state=None)
            logreg.fit(X_train, y_train)
            lr_y_pred = logreg.predict(X_test)
            lr_y_pred_proba = logreg.predict_proba(X_test)[:,1]

            lr_one_fold_pred.append(lr_y_pred)
            lr_one_fold_proba.append(lr_y_pred_proba)

            indicator_time = indicator_time + 1
            # print 'the next testing sample and total samples:', indicator_time, test_index.size

        xg_y_individual_pred = np.array(xg_one_fold_pred)
        xg_y_individual_proba = np.array(xg_one_fold_proba)

        lr_y_individual_pred = np.array(lr_one_fold_pred)
        lr_y_individual_proba = np.array(lr_one_fold_proba)

        one_fold_y_test = y[test_index]

        print 'this is the result of individual predictor using xgboost:'
        print 'the acc of one fold:', accuracy_score(one_fold_y_test, xg_y_individual_pred)
        print 'the classification_report :', classification_report(one_fold_y_test, xg_y_individual_pred)
        print 'the auc of one fold:', roc_auc_score(one_fold_y_test, xg_y_individual_proba)

        print 'this is the result of individual predictor using lr:'
        print 'the acc of one fold:', accuracy_score(one_fold_y_test, lr_y_individual_pred)
        print 'the classification_report :', classification_report(one_fold_y_test, lr_y_individual_pred)
        print 'the auc of one fold:', roc_auc_score(one_fold_y_test, lr_y_individual_pred)

# using non-individual predictor for classification

        xgboost_random = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
                                    min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
        xgboost_random.fit(X_train_0, y_train_0)
        y_pred_random = xgboost_random.predict(X_test_0)
        y_proba_random = xgboost_random.predict_proba(X_test_0)[:,1]

        y_test_random = y[test_index]

        print 'this is the result of non-individual predictor using xgboost:'
        print 'the acc is:',accuracy_score(y_test_random, y_pred_random)
        print 'the classification_report:', classification_report(y_test_random, y_pred_random)
        print 'the auc is:', roc_auc_score(y_test_random, y_proba_random)

        logreg_random = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
                                    intercept_scaling=1, class_weight='balanced', random_state=None)
        logreg_random.fit(X_train_0, y_train_0)
        lr_y_pred_random = logreg_random.predict(X_test_0)
        lr_y_pred_proba_random = logreg_random.predict_proba(X_test_0)[:, 1]

        print 'this is the result of non-individual predictor using lr:'
        print 'the acc is:',accuracy_score(y_test_random, lr_y_pred_random)
        print 'the classification_report:', classification_report(y_test_random, lr_y_pred_random)
        print 'the auc is:', roc_auc_score(y_test_random, lr_y_pred_proba_random)

        # break

In [None]:

# if __name__ == '__main__': #basically execute only if run as a script. i will undo this
#     path = './logs/individualization_24_1th.txt'
#     f = open(path, 'a+')
#     orig_stdout = sys.stdout
#     sys.stdout = f
#     for time_interval in [24]:  # ,48,72,96,120,144]:
#         x, y, z_icustay_id, all_xy = preprocessing(folder, time_interval)  # all_xy is for compute gower distance

#         skf = StratifiedKFold(n_splits=5)
#         print '%%%%%'
#         num_fold = 0
#         for train_index, test_index in skf.split(x, y):
#             print '***************'
#             # print 'This is the '+ str(i)+' times result of '+str(n_fold)+' fold'
#             X_train_0, X_test_0 = x[train_index], x[test_index]
#             y_train_0, y_test_0 = y[train_index], y[test_index]

#             print '#####################'

#             num_fold = num_fold + 1
#             print 'this is the results of the %d fold in 5 folds:' %num_fold

#             print 'the number of testing samples in this fold:', test_index.size

#             train_z_icustay_id = z_icustay_id[train_index] # the icustay_id of samples in training set from 5 fold
#             test_z_icustay_id = z_icustay_id[test_index] # the icustay_id of samples in testing set from 5 fold

#             xg_one_fold_pred = [] # obtain the pred label of testing samples for one fold using xgboost
#             xg_one_fold_proba = [] # obtain the proba  of testing samples for one fold using xgboost

#             lr_one_fold_pred = [] # obtain the pred label of testing samples for one fold using lr
#             lr_one_fold_proba = [] # obtain the proba  of testing samples for one fold using lr

#             indicator_time = 0 # the indicator
#             for i, j in zip(test_z_icustay_id, test_index):
#                 # i_index = np.where(test_z_icustay_id == i)
#                 # tem_test_z_icustay_id = np.delete(test_z_icustay_id, i_index)
#                 testing_sample_id = i

#                 all_xy_0 = all_xy.loc[train_z_icustay_id] # select training samples from  5 fold
#                 all_xy_training = all_xy_0.append(all_xy.loc[i]) # note that , containing the i

#                 m = 400  # m is the number of similar cases or similar controls

#                 X_test_00 = x[j]
#                 y_test = y[j]

#                 X_test = X_test_00.reshape(1, -1)

#                 # print 'start selecting......'

#                 Id_train_set = select_train_samples(testing_sample_id, all_xy_training, m, time_interval)  #  individulization

#                 ix = np.isin(z_icustay_id, Id_train_set)
#                 Id_train_set_index = list(np.where(ix))

#                 # Id_train_set_index = np.argwhere(z_icustay_id == Id_train_set)

#                 X_train = x[Id_train_set_index]
#                 y_train = y[Id_train_set_index]

#                 # print 'start training......'

#                 # scoring = 'roc_auc'

# # xgboost

#                 xgboost_mod = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
#                               min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
#                               objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
#                 xgboost_mod.fit(X_train, y_train)
#                 xg_y_pred = xgboost_mod.predict(X_test)
#                 xg_y_pred_proba = xgboost_mod.predict_proba(X_test)[:,1]

#                 xg_one_fold_pred.append(xg_y_pred)
#                 xg_one_fold_proba.append(xg_y_pred_proba)

# # lr 

#                 logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
#                                             intercept_scaling=1, class_weight='balanced', random_state=None)
#                 logreg.fit(X_train, y_train)
#                 lr_y_pred = logreg.predict(X_test)
#                 lr_y_pred_proba = logreg.predict_proba(X_test)[:,1]

#                 lr_one_fold_pred.append(lr_y_pred)
#                 lr_one_fold_proba.append(lr_y_pred_proba)

#                 indicator_time = indicator_time + 1
#                 # print 'the next testing sample and total samples:', indicator_time, test_index.size

#             xg_y_individual_pred = np.array(xg_one_fold_pred)
#             xg_y_individual_proba = np.array(xg_one_fold_proba)

#             lr_y_individual_pred = np.array(lr_one_fold_pred)
#             lr_y_individual_proba = np.array(lr_one_fold_proba)

#             one_fold_y_test = y[test_index]

#             print 'this is the result of individual predictor using xgboost:'
#             print 'the acc of one fold:', accuracy_score(one_fold_y_test, xg_y_individual_pred)
#             print 'the classification_report :', classification_report(one_fold_y_test, xg_y_individual_pred)
#             print 'the auc of one fold:', roc_auc_score(one_fold_y_test, xg_y_individual_proba)

#             print 'this is the result of individual predictor using lr:'
#             print 'the acc of one fold:', accuracy_score(one_fold_y_test, lr_y_individual_pred)
#             print 'the classification_report :', classification_report(one_fold_y_test, lr_y_individual_pred)
#             print 'the auc of one fold:', roc_auc_score(one_fold_y_test, lr_y_individual_pred)

# # using non-individual predictor for classification

#             xgboost_random = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=5,
#                                         min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
#                                         objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
#             xgboost_random.fit(X_train_0, y_train_0)
#             y_pred_random = xgboost_random.predict(X_test_0)
#             y_proba_random = xgboost_random.predict_proba(X_test_0)[:,1]

#             y_test_random = y[test_index]

#             print 'this is the result of non-individual predictor using xgboost:'
#             print 'the acc is:',accuracy_score(y_test_random, y_pred_random)
#             print 'the classification_report:', classification_report(y_test_random, y_pred_random)
#             print 'the auc is:', roc_auc_score(y_test_random, y_proba_random)

#             logreg_random = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=10, fit_intercept=True,
#                                         intercept_scaling=1, class_weight='balanced', random_state=None)
#             logreg_random.fit(X_train_0, y_train_0)
#             lr_y_pred_random = logreg_random.predict(X_test_0)
#             lr_y_pred_proba_random = logreg_random.predict_proba(X_test_0)[:, 1]

#             print 'this is the result of non-individual predictor using lr:'
#             print 'the acc is:',accuracy_score(y_test_random, lr_y_pred_random)
#             print 'the classification_report:', classification_report(y_test_random, lr_y_pred_random)
#             print 'the auc is:', roc_auc_score(y_test_random, lr_y_pred_proba_random)

#             # break
#     sys.stdout = orig_stdout
#     f.close()

## need to have a robust evaluation of model performance


## need to try cross validation