In [1]:
import sys
import os
import pandas as pd
import numpy as np
from copy import deepcopy
import xgboost as xgb
import random
import statsmodels.api as sm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.base import BaseEstimator, TransformerMixin

# imbalance learn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# deepnet 
sys.path.insert(0, 'ENNS')
from dnp2 import DeepNet

import warnings
warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"
_ = np.seterr(divide='ignore', invalid='ignore')

In [2]:
# NOTE: The ENNS feature selector below might cause a segmentation fault under Python 3.9.
# Print the interpreter version so the environment used for this run is visible in the output.
from platform import python_version
print(python_version())

3.8.6


## Load genomic features and their corresponding heteroresistance labels

In [3]:
# Feature matrix built from ALL feature types (SNV, InDel, CNV); the first CSV
# column is used as the row index.
df_X = pd.read_csv('data/Xmatrix_all_features.csv', index_col=0)
# Labels are cast to int and re-ordered so their rows line up with df_X.
df_y = (
    pd.read_csv("data/Ylabel.csv", index_col=0)
    .astype(int)
    .loc[df_X.index]
)
df_X.shape, df_y.shape

((160, 6806), (160, 1))

In [4]:
##################################################
# If only SNV features are used, uncomment the following code
#df_X = pd.read_csv('data/Xmatrix_SNV_only.csv', index_col=0)
#df_y = pd.read_csv("data/Ylabel.csv", index_col=0)[['HR']].astype(int).loc[df_X.index]
#df_X.shape, df_y.shape
##################################################

## Random subsampling validation in the outer loop

In [5]:
# N is the number of random subsampling repeats (outer-loop iterations).
# A full run of 50 repeats might take several days to a week to complete,
# so N is set to 2 here for a quicker run.
N = 2
# Seed handed to train_test_split and to the classifiers defined further down.
random_state = 42

In [6]:
###############################################################################
# To reuse the 50 data splits that were used to generate the figures,
# uncomment the following line and skip the rest of this cell:
#
# df_data_split = pd.read_csv("data/train_test_splits.csv")
#
###############################################################################
# Build N stratified 80/20 train/test splits and store the isolate names of
# each split as comma-separated strings, one row per iteration.
train_test_splits = []
for iteration in range(N):
    # train_test_split always uses the same random_state, so the variation
    # between iterations comes from this shuffle. Seeding it per iteration
    # makes every split reproducible across kernel restarts while still
    # giving each iteration a different split.
    df_X_shuffle = df_X.sample(frac=1, random_state=random_state + iteration)
    df_y_shuffle = df_y.loc[df_X_shuffle.index]

    # get train and test dataset (stratified on the HR label)
    X_train, X_test, y_train, y_test = train_test_split(
        df_X_shuffle,
        df_y_shuffle,
        test_size=0.20,
        random_state=random_state,
        stratify=df_y_shuffle.HR,
    )
    train_isolates = ','.join(list(X_train.index))
    test_isolates = ','.join(list(X_test.index))

    train_test_splits.append([iteration, train_isolates, test_isolates])

# NOTE(review): 'iteraction' is a misspelling of 'iteration'; it is kept as-is
# because the saved split file referenced above presumably uses the same
# column name -- confirm before renaming.
df_data_split = pd.DataFrame(train_test_splits, columns=['iteraction', 'train_set_isolates','test_set_isolates'])
df_data_split.head()

Unnamed: 0,iteraction,train_set_isolates,test_set_isolates
0,0,"CDC336,MSK808,E12,E81,MSK1666,E1,MSK2191,MSK24...","E38,MSK844,E80,CDC337,UWM1195,E44,CDC344,MSK24..."
1,1,"MSK2413,E36,E12,MSK800,GL28,CDC340,MSK314,MSK2...","E34,E10,E81,CDC336,MSK804,E51,E54,MSK2448,E14,..."


## Define feature selection methods

In [7]:
# logistic regression
class feature_selector_logit(BaseEstimator, TransformerMixin):
    """Select features by univariate logistic-regression p-value.

    One logistic model (with an added constant) is fitted per column of X
    against the 'HR' column of y; the `n_features` columns with the smallest
    non-null p-values are kept.
    """

    def __init__(self, n_features=10):
        self.n_features = n_features
        # Filled in by fit(); list of selected column names.
        self.selected_features = None

    def fit(self, X, y=None):
        """Fit one logit model per column of X and remember the top-ranked columns.

        X is indexed by column label (X[[name]]) and y must expose an 'HR'
        column, so both are expected to be pandas DataFrames.
        Returns self.
        """
        records = []
        for feature in X.columns:
            design = sm.add_constant(X[[feature]])
            response = y[['HR']]
            result = sm.Logit(response, design).fit(method='bfgs', disp=0)
            # Row `feature` of the coefficient table holds that feature's p-value.
            pvalue = result.summary2().tables[1].loc[feature,'P>|z|']
            records.append([feature, pvalue])

        ranking = pd.DataFrame(records, columns=['Feature','Pvalue'])
        # Drop features whose p-value is missing, then rank ascending.
        ranking = ranking[ranking.Pvalue.notnull()]
        ranking = ranking.sort_values('Pvalue')
        top = ranking.iloc[:self.n_features]
        self.selected_features = list(top.Feature)
        return self

    def transform(self, X, y=None, **kwargs):
        """Return X restricted to the columns chosen in fit()."""
        return X[self.selected_features]

# Lasso
class feature_selector_lasso(BaseEstimator, TransformerMixin):
    """Select features by the magnitude of their LassoCV coefficients.

    A cross-validated Lasso is fitted on X and y; the features with non-zero
    coefficients are ranked by absolute coefficient and at most `n_features`
    of them are kept. If Lasso zeroes out every coefficient, a random set of
    features is used instead so downstream pipeline steps still receive input.
    """

    def __init__(self, n_features=10):
        self.n_features = n_features
        # Filled in by fit(); list of selected column names.
        self.selected_features = None

    def fit(self, X, y=None):
        """Fit LassoCV and remember the top-ranked columns of X.

        X must be a pandas DataFrame (its column labels are the feature
        names); y is flattened to a 1-D array before fitting.
        Returns self.
        """
        reg = LassoCV(
            cv=3,
            random_state=42,
            n_alphas=1000,
            max_iter=100000,
            verbose=0,
            n_jobs=-1
        ).fit(X.values, np.ravel(np.asarray(y)))

        # Keep only the features Lasso did not shrink to zero.
        coefs = pd.Series(reg.coef_, index=X.columns)
        coefs = coefs[coefs != 0]

        # deal with situation when no feature was selected
        if coefs.empty:
            # then randomly select n_features.
            # random.sample needs a real sequence (a pandas Index is rejected
            # with TypeError), and it cannot draw more items than exist.
            n_random = min(self.n_features, X.shape[1])
            self.selected_features = random.sample(list(X.columns), n_random)
        else:
            # Largest absolute coefficient first; slicing past the end is safe
            # when fewer than n_features coefficients are non-zero.
            ranked = coefs.abs().sort_values(ascending=False)
            self.selected_features = list(ranked.index[:self.n_features])
        return self

    def transform(self, X, y=None, **kwargs):
        """Return X restricted to the columns chosen in fit()."""
        return X[self.selected_features]
    
# ENNS
class feature_selector_enns(BaseEstimator, TransformerMixin):
    """Select features with the DeepNet (ENNS) selector imported from dnp2."""

    def __init__(self, n_features=10):
        self.n_features = n_features
        # Filled in by fit(); list of selected column names.
        self.selected_features = None

    def fit(self, X, y=None):
        """Train a DeepNet limited to `n_features` features and store the chosen column names.

        Returns self.
        """
        net = DeepNet(max_feature=self.n_features)
        picked = net.train(X, y, return_select=True, verbosity=2)
        column_names = list(X.columns)
        # NOTE(review): the `- 1` presumably converts 1-based indices returned
        # by DeepNet.train into 0-based positions -- confirm against dnp2.
        self.selected_features = [column_names[idx-1] for idx in picked]
        return self

    def transform(self, X, y=None, **kwargs):
        """Return X restricted to the columns chosen in fit()."""
        return X[self.selected_features]
    
# Registry mapping a short method name to a feature-selector instance
# (each created with its default n_features).
feature_selectors = dict(
    Logit=feature_selector_logit(),
    Lasso=feature_selector_lasso(),
    ENNS=feature_selector_enns(),
)

## Define classifiers

In [8]:
# Random forest with class weights balanced against the label frequencies.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    random_state=random_state,
    class_weight="balanced",
)
# Gradient-boosted trees with a binary logistic objective.
xgboost = xgb.XGBClassifier(
    n_estimators=1000,
    seed=random_state,
    objective='binary:logistic',
)
# Registry mapping a short model name to its classifier instance.
classifiers = dict(RFC=random_forest, XGB=xgboost)

## Model training and evaluation

In [9]:
# Hyperparameter grids searched for each classifier. They depend only on the
# classifier, so they are defined once instead of inside the nested loops.
# The keys use the "<pipeline step>__<parameter>" convention of the pipeline
# assembled below.
parameter_grids = {
    'RFC': {
        'feature_selector__n_features':[2,4,6,8,10],
        'classifier__max_features':['sqrt',None],
        'classifier__max_depth':[2,4,6],
        'classifier__max_samples':[0.5,0.75,1.0]
    },
    'XGB': {
        'feature_selector__n_features':[2,4,6,8,10],
        'classifier__colsample_bytree':[0.5,0.75,1.0],
        'classifier__max_depth':[2,4,6],
        'classifier__learning_rate':[0.3,0.1,0.03]
    }
}

# One row per (iteration, feature selector, classifier) combination.
evaluation = []
for iteration in range(N):
    print('Current iteration:', iteration)

    # get the outer training and test dataset for each iteration
    train_isolates = df_data_split.loc[iteration,'train_set_isolates'].split(',')
    test_isolates = df_data_split.loc[iteration,'test_set_isolates'].split(',')
    X_train = deepcopy(df_X.loc[train_isolates])
    X_test = deepcopy(df_X.loc[test_isolates])
    y_train = deepcopy(df_y.loc[train_isolates])
    y_test = deepcopy(df_y.loc[test_isolates])

    #####################################################################
    # If using undersampling or oversampling, uncomment the following code
    #
    # oversampling
    # smote = SMOTE(random_state=random_state)
    # X_train, y_train = smote.fit_resample(X_train, y_train)
    #
    # undersampling
    # rus = RandomUnderSampler(random_state=random_state)
    # X_train, y_train = rus.fit_resample(X_train, y_train)
    #####################################################################

    ###################################################################
    # If shuffling heteroresistance labels, uncomment the following code
    #
    # y_train['HR'] = y_train['HR'].sample(frac=1).values
    ###################################################################

    # use the outer training dataset for feature selection and hyperparameter tuning
    # use the outer test dataset for model evaluation
    for feature_selector in ['Logit','Lasso','ENNS']:
        for classifier in ['RFC','XGB']:
            print('training model (%s + %s)...'%(feature_selector, classifier))

            parameters = parameter_grids[classifier]

            # create pipeline: scale -> drop zero-variance columns -> select features -> classify.
            # The first two steps emit DataFrames because the custom selectors
            # look features up by column name.
            scaler = StandardScaler().set_output(transform="pandas")
            thresholder = VarianceThreshold().set_output(transform="pandas")
            pipeline = Pipeline(
                steps=[
                    ("scaler", scaler),
                    ("thresholder", thresholder),
                    ("feature_selector", feature_selectors[feature_selector]),
                    ("classifier", classifiers[classifier])
                ]
            )

            # run grid search (5-fold stratified CV, F1 scoring) for model training
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                CV = GridSearchCV(pipeline, parameters, scoring='f1', cv=StratifiedKFold(n_splits=5), n_jobs=-1, verbose=0)
                CV.fit(X_train,y_train)

            # The pipeline refitted on the whole outer training set with the best parameters.
            final_model = CV.best_estimator_

            # compute training scores
            y_pred = final_model.predict(X_train)
            accuracy_train = accuracy_score(y_true=y_train, y_pred=y_pred)
            precision_train = precision_score(y_true=y_train, y_pred=y_pred)
            recall_train = recall_score(y_true=y_train, y_pred=y_pred)
            f1_train = f1_score(y_true=y_train, y_pred=y_pred)

            # compute test scores
            y_pred = final_model.predict(X_test)
            accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred)
            precision_test = precision_score(y_true=y_test, y_pred=y_pred)
            recall_test = recall_score(y_true=y_test, y_pred=y_pred)
            f1_test = f1_score(y_true=y_test, y_pred=y_pred)

            # isolates misclassified on the test set:
            # false negative = predicted 0 but labelled 1; false positive = predicted 1 but labelled 0
            test_outcomes = list(zip(list(X_test.index), list(y_pred), list(y_test.HR)))
            FN_test = [ti for ti, predicted, actual in test_outcomes if predicted == 0 and actual == 1]
            FP_test = [ti for ti, predicted, actual in test_outcomes if predicted == 1 and actual == 0]

            # get selected features of the best model, annotated with the
            # classifier's feature importance and sorted by decreasing importance
            n_features = CV.best_params_['feature_selector__n_features']
            selected_features = final_model.named_steps['feature_selector'].selected_features
            feature_importance = list(final_model.named_steps['classifier'].feature_importances_)
            selected_features = [
                "%s (%2.4f)"%(feature, importance)
                for importance, feature in sorted(zip(feature_importance, selected_features), reverse=True)
            ]

            # get best parameters as "name:value" pairs separated by commas
            bestp = ','.join('%s:%s'%(k,v) for k,v in CV.best_params_.items())

            # save to results
            evaluation.append(
                [iteration, feature_selector, classifier, n_features, (',').join(selected_features), bestp,
                 accuracy_train, precision_train, recall_train, f1_train,
                 accuracy_test, precision_test, recall_test, f1_test,
                 (',').join(FN_test), (',').join(FP_test)]
            )

    print('Done.')
    print()

df_eval = pd.DataFrame(evaluation,
                       columns=[
                           'iteration','feature_selector','classification_model','best_model_n_features','selected_features','best_params',
                           'accuracy_train','precision_train','recall_train','f1_train',
                           'accuracy_test','precision_test','recall_test','f1_test',
                           'false_negative_test','false_positive_test']
                      )
df_eval.head()

Current iteration: 0
training model (Logit + RFC)...
training model (Logit + XGB)...
training model (Lasso + RFC)...
training model (Lasso + XGB)...
training model (ENNS + RFC)...
training model (ENNS + XGB)...
Done.

Current iteration: 1
training model (Logit + RFC)...
training model (Logit + XGB)...
training model (Lasso + RFC)...
training model (Lasso + XGB)...
training model (ENNS + RFC)...
training model (ENNS + XGB)...
Done.



Unnamed: 0,iteration,feature_selector,classification_model,best_model_n_features,selected_features,best_params,accuracy_train,precision_train,recall_train,f1_train,accuracy_test,precision_test,recall_test,f1_test,false_negative_test,false_positive_test
0,0,Logit,RFC,8,"Pattern_2154 (0.4046),Pattern_1139 (0.1552),Pa...","classifier__max_depth:4,classifier__max_featur...",0.867188,0.8,0.780488,0.790123,0.875,0.875,0.7,0.777778,"E38,E34,E46",MSK800
1,0,Logit,XGB,8,"Pattern_2154 (0.4361),Pattern_4466 (0.3562),Pa...","classifier__colsample_bytree:0.5,classifier__l...",0.859375,0.828571,0.707317,0.763158,0.84375,0.857143,0.6,0.705882,"E38,CDC344,E34,E46",MSK800
2,0,Lasso,RFC,8,"Pattern_2154 (0.2486),Pattern_5814 (0.2331),Pa...","classifier__max_depth:2,classifier__max_featur...",0.867188,0.875,0.682927,0.767123,0.84375,0.857143,0.6,0.705882,"E38,CDC344,E34,E46",MSK800
3,0,Lasso,XGB,4,"Pattern_2154 (0.5826),Pattern_5513 (0.4174),Pa...","classifier__colsample_bytree:0.5,classifier__l...",0.84375,0.783784,0.707317,0.74359,0.875,0.875,0.7,0.777778,"E38,E34,E46",MSK800
4,0,ENNS,RFC,4,"Pattern_2154 (0.3976),Pattern_4466 (0.3087),Pa...","classifier__max_depth:2,classifier__max_featur...",0.851562,0.775,0.756098,0.765432,0.875,0.875,0.7,0.777778,"E38,E34,E46",MSK800
