In [526]:
import os
import sys
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from copy import deepcopy
import numpy as np
import scipy.stats as stats
from sklearn import preprocessing, linear_model
from sklearn.model_selection import KFold
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils import check_array
from sklearn.ensemble import RandomForestClassifier

In [481]:
def kfold_feature_learning(train, test, y, t_y, clf = linear_model.LassoCV(cv=10), problem = 'regression', 
                           folds = 10, scale=True, verbose = True, search = False,
                           p_cutoff = None, regcols = None, regdf = None, keep_cols = None, 
                           out_dir = None,  output='light', save_int = True):
    '''
    This is a function that will use nested cross validation to generate an average model
    that will hopefully generalize better to unseen test data. 
    
    You can must your training and testing data, and your y variable for both, the model you 
    wish to use for  prediction, and whether the problem is classification or regression.
    
    The function will run K iterations of prediction on your training set, and will average 
    the weights across folds for a final model. The final model will then be applied to your 
    testing data. The validation and testing accuracy will be displayed. 
    
    Several other options exists (see below), and are forthcoming.
    
    ATTENTION: THIS SCRIPT IS STILL IN DEVELOPMENT. IT IS UGLY AND UNFINISHED, SO DONT JUDGE
    
    
    *** USER-DEFINED ARGUMENTS ***
    
    -- train is a subjects x variables dataframe (this represents your training data)
    -- y is a pandas series with the same index as train. y should not be in train
    
    # NOTE: train and test indices should just be a range
    
    -- test is a subjects x variables dataframe (this represents your independent test data)
    -- t_y is a pandas seris with the same index as test. y should not be in tets

    
    *** MODEL OPTIONS ***

    -- clf: here, you can put in whatever model you want with whatever parameters you want.
    
    -- if your model (clf) is a regression model (e.g. Lasso, SVR), leave problem as "regression". 
    If it is a classification model (e.g. SVM, SGD, etc.), change problem to "classification"
    
    -- folds: how many fold cross-validation should occur within the outer loop of the t
    raining dataset
    
    -- scale: if True, train will be scaled with a Standard Scaler, and test will be transformed 
    to this scale
    
    -- verbose: if you do not want any output (including scores at the end!!), set this to False.
    
    -- search: if clf is a model_selector (such as GridSearch), MAKE SURE you set this to True,
    or the script will fail.
    
    
    *** FEATURE SELECTION OPTIONS ***
    
    -- p_cutoff: if you wish to only keep features statistically related to y (through t-test 
    or correlation), you can control the alpha value here. Leave as None to use all features
    
    -- reg_cols: a list of column labels in regdf. All labels specified in this list will be 
    regressed out of all other model features during "feature selection" (i.e. when features are
    removed via the p_cutoff argument). In other words, this argument can be used if you only
    want to include features in your model that are singificant when adjusting for the variables
    specified in reg_cols. Leave as None if you don't want this argument active.
    
    -- regdf: a subject x variables pandas Dataframe that contain anything as long as it has all 
    (matched) indices in train, and all columns in reg_cols
    
    -- keep_cols: a list of column labels. These are variables in train that you wish to retain 
    in your model no matter what, even if they do not pass the feature selection. For example, if
    age is not selected by your p_cutoff, but you still want age in your model, you can list in
    keep_cols
    
    *** OUTPUT OPTIONS ***
    
    -- out_dir: will save your weight matrix and predicted values to a directory you specify
    
    -- output: decide what you want the function to return:
        * 'scores' will only return the r2 (regression) or sensitivity, specificity and accuracy
            of you validation and test.
        * 'light' will return the weights of your final model, the predicted values of your
            validation, the predicted values of your test, and the intercept of the final model, 
            in that order.
        * 'heavy' if problem = regression, will return everything from light, plus a matrix 
            containing weights from all folds of the validations. Will also return the model 
            from the most recent fold.
            if problem = classification, will return a summary dataframe (with weights) for your
            validation, a summary dataframe for your test, your predicted values from validation,
            predicted values from test, a matrix containing weights from all folds of the 
            validation, and the model from the most recent fold.
        
        '''
    
    
    if problem != 'regression' and problem != 'classification':
        raise IOError('please set problem to regression or classification')
    
    #feature_matrix = pandas.DataFrame(np.zeros_like(train))
    
    # Initiate variables
    predicted = []
    all_weights = pandas.DataFrame(np.zeros((folds,len(train.columns))))
    if save_int:
        ints = []
    start = 0
    fold = 1
    
    # scale inputs
    if scale:
        master_scl = preprocessing.StandardScaler().fit(train)
        train = pandas.DataFrame(master_scl.transform(train),
                                 index=train.index,columns=train.columns)
        test = pandas.DataFrame(master_scl.transform(test),
                                 index=test.index,columns=test.columns)
    
    # strip columns names
    tr_cols = train.columns 
    train.columns = range(len(train.columns))
    test.colums = range(len(test.columns))
    
    
    for tr_ix, te_ix in KFold(n_splits=folds).split(train):
        tmp_mtx = train.loc[train.index[tr_ix]] # working matrix
        
        # Build regression statements (if regcols)
        if regcols != None: 
            ref = deepcopy(tmp_mtx)
            tmp_mtx.columns = ['x_%s'%x for x in tmp_mtx.columns]
            tmp_mtx['y'] = y.loc[tmp_mtx.index]
            stmnt = 'y ~'
            for z,col in enumerate(regcols):
                cov = 'cov_%s'%z
                tmp_mtx[cov] = regdf.loc[tmp_mtx.index][col]
                if z == 0:
                    stmnt += ' %s'%cov
                else:
                    stmnt += ' + %s'%cov
        else:
            regcols = []
        
        # feature selection -- only retain significant features
        ps = []
        if p_cutoff != None:
            if len(regcols) > 0:
                if verbose:
                    print('running regression for fold %s of %s'%(fold,folds))
                for x in range(tmp_mtx.shape[1] - (len(regcols) + 1)):
                    n_stmnt = '%s + x_%s'%(stmnt,x)
                    ps.append(smf.ols(stmnt,data=temp_mtx).fit().pvalues[-1])
                sig_mtx = ref.loc[ref.index[:]]
            else:
                if problem == 'regression':
                    if verbose:
                        print('running correlation for fold %s of %s'%(fold,folds))
                    for x in range(tmp_mtx.shape[1]):
                        ps.append(stats.pearsonr(
                                y[tmp_mtx.index].values,tmp_mtx.values[:,x])[1]
                             )
                else: # classification
                    if verbose:
                        print('running ttests for fold %s of %s'%(fold,folds))
                    for x in range(tmp_mtx.shape[1]):
                        ps.append(stats.ttest_ind(
                                tmp_mtx.loc[y[tmp_mtx.index][y[tmp_mtx.index]==0].index][tmp_mtx.columns[x]],
                                tmp_mtx.loc[y[tmp_mtx.index][y[tmp_mtx.index]==1].index][tmp_mtx.columns[x]]
                             )[1])
            ps_s = pandas.Series(ps)
            sig = ps_s[ps_s < p_cutoff]
            if len(sig) == 0:
                fold += 1
                continue
            sig_mtx = tmp_mtx[sig.index]
        else:
            sig_mtx = tmp_mtx[tmp_mtx.columns[:]]
        
        # run model
        if verbose:
            print('running model for fold %s of %s'%(fold,folds))
        if type(keep_cols) == list:
            for col in keep_cols:
                sig_mtx[col] = tmp_mtx.ix[:,col]
        if search:
            mod_sel = clf.fit(sig_mtx,y[sig_mtx.index])
            new_clf = mod_sel.best_estimator_
            model = new_clf.fit(sig_mtx,y[sig_mtx.index])
        else:
            model = clf.fit(sig_mtx,y[sig_mtx.index])
        try:
            all_weights.loc[(fold-1)][sig_mtx.columns] = model.coef_
        except:
            all_weights.loc[(fold-1)][sig_mtx.columns] = model.coef_[0,:]
        # save predicted values for this validation fold
        [predicted.append(x) for x in model.predict(train.loc[train.index[te_ix]][
                                                    sig_mtx.columns].values)]
        if save_int:
            ints.append(model.intercept_)
        
        # reset variables
        fold += 1
        if len(regcols) == 0:
            regcols = None
        
        # save output
        if out_dir != None and type(out_dir) == str:
            print('saving matrix for fold %s of %s'%(fold,folds))
            if not os.path.isdir(out_dir):
                os.mkdir(out_dir)
            feature_matrix.to_csv(os.path.join(out_dir,'lasso_weights.csv'))
            pandas.DataFrame(pandas.Series(predicted)).to_csv(
                                                    os.path.join(out_dir,'lasso_predicted.csv'))
        
    # assemble final model
    final_weights = all_weights.mean(axis=0)
    n_feats = len([i for i in final_weights.index if abs(final_weights[i]) > 0 ])
    if verbose:
        print(n_feats,'features selected')
    
    if n_feats == 0:
        val_res, t_res = np.nan, np.nan
        predicted, t_predicted = [], np.array([])
        if save_int:
            all_ints = np.mean(ints)
        else:
            all_ints = np.nan
        val_sum, t_sum = pandas.DataFrame(), pandas.DataFrame()
    else:
    
        # run validation
        if problem == 'regression':
            r,p = stats.pearsonr(y[train.index],predicted)
            val_res = (r**2)*100
            if verbose:
                print('validation prediction accuracy is %s percent \n p = %s \n r = %s'%(val_res,p,r))
        else:
            val_sum, val_res = manual_classification(y[train.index],predicted,verbose,'validation')

        # apply model to test data
        ntest = check_array(test,accept_sparse='csr')
        t_predicted = pandas.Series(safe_sparse_dot(ntest,np.array(final_weights).T,dense_output=True),index=test.index)
        if save_int:
            all_ints = np.mean(ints)
            t_predicted += all_ints
        else:
            all_ints = []

        # run test
        if problem == 'regression':
            r,p = stats.pearsonr(t_y[test.index],t_predicted)
            t_res = (r**2)*100
            if verbose:
                print('testing prediction accuracy is %s percent \n p = %s \n r = %s'%(t_res,p,r))
        else: # classification
            t_decision_func = t_predicted
            t_predicted = pandas.Series(index = test.index)
            t_predicted[t_decision_func[t_decision_func<0].index] = 0
            t_predicted[t_decision_func[t_decision_func>0].index] = 1
            t_sum, t_res = manual_classification(t_y[test.index],t_predicted,verbose,'testing',t_decision_func)

    # prepare outputs
    
    final_weights.columns = tr_cols
    all_weights.columns = tr_cols
    
    if output == 'scores':
            return val_res, t_res
    elif output == 'light':
        return final_weights, predicted, t_predicted, all_ints
    else:
        if problem == 'regression':
            return final_weights, predicted, t_predicted, all_ints, all_weights, model
        else:
            return val_sum, t_sum, predicted, t_predicted, all_ints, all_weights, model 

def manual_classification(obs, pred, verbose, mode='validation', weights=None):
            
    if type(obs) == pandas.core.series.Series:
        obs = obs.values
    
    if type(pred) == pandas.core.series.Series:
        pred = pred.values
    
    summary = pandas.DataFrame(index=range(len(obs)),columns = ['Predicted','Actual'])
    summary['Predicted'] = pred
    summary['Actual'] = obs
    if type(weights) != type(None):
        summary['Prediction Function'] = weights
    for x in summary.index: 
        if summary.ix[x,'Predicted'] == summary.ix[x,'Actual']:
            summary.ix[x,'Hit'] = 1
        else:
            summary.ix[x,'Hit'] = 0

    tp,tn,fp,fn = [],[],[],[]
    for i,row in summary.iterrows():
        val = row['Predicted'] - row['Actual']
        if val == 0:
            if row['Actual'] == 1:
                tp.append(i)
            else:
                tn.append(i)
        elif val == 1:
            fp.append(i)
        elif val == -1:
            fn.append(i)
        else:
            print('something went wrong for ',i)

    sens = len(tp)/(len(tp)+len(fn))
    spec = len(tn)/(len(tn)+len(fp))
    acc = (len(tp)+len(tn))/(len(tp)+len(fn)+len(tn)+len(fp))

    if verbose:
        print(mode,' sensitivity:' , sens)
        print(mode,'specificity:' , spec)
        print(mode,'accuracy:', acc)

    results = [sens,spec,acc]

    return summary, results




# GET VARIABLES 

In [331]:
df = pandas.read_csv('/Users/jakevogel/Downloads/adni_bl_all_ica50_scores_20170922.csv')
df.head()

Unnamed: 0,subject,RID,PTID,age_r,sex,mean_gm,TIV,EXAMDATE,scan_date,vbm_file,...,score_41,score_42,score_43,score_44,score_45,score_46,score_47,score_48,score_49,score_50
0,subject0021,21.0,011_S_0021,72.6,0.0,0.290485,1440900.75,10/24/2005,10/10/05,smwrc1rl_T1_scandate_732595_birthdate_705585_0...,...,35.530734,35.859132,32.37315,35.58264,3.5972,20.694638,30.465716,30.551352,17.848491,33.041795
1,subject0023,23.0,011_S_0023,71.7,1.0,0.332182,1544985.5,11/8/2005,10/31/05,smwrc1rl_T1_scandate_732616_birthdate_704876_0...,...,46.745997,37.528679,35.017123,46.444176,-0.549843,28.184171,32.632151,51.51163,16.697365,32.914088
2,subject0059,59.0,067_S_0059,78.875359,0.0,0.282895,1279627.0,12/13/2013,12/13/13,smwrc1rl_T1_scandate_735581_birthdate_707841_0...,...,41.090286,39.981865,31.35258,41.385673,8.823693,29.729746,29.393542,31.002191,17.068733,32.878287
3,subject0069,69.0,100_S_0069,72.9,1.0,0.355717,1762411.5,1/17/2006,12/13/05,smwrc1rl_T1_scandate_732659_birthdate_703094_1...,...,40.169767,38.129086,39.121106,55.116683,0.539394,30.381675,39.192471,36.662482,20.824451,44.217368
4,subject0089,89.0,073_S_0089,65.1,1.0,0.349537,1589139.25,1/31/2006,01/26/06,smwrc1rl_T1_scandate_732703_birthdate_708248_0...,...,37.290587,47.598844,43.118446,47.927207,4.419909,33.71041,37.089623,39.986202,20.966313,45.762426


In [332]:
df.columns.tolist()[:14]

['subject',
 'RID',
 'PTID',
 'age_r',
 'sex',
 'mean_gm',
 'TIV',
 'EXAMDATE',
 'scan_date',
 'vbm_file',
 'CN',
 'MCI',
 'AD',
 'conv_2_AD']

In [333]:
df.shape

(477, 64)

In [334]:
train = df[df.MCI == 0]
train.shape

(269, 64)

In [335]:
X_train = train[train.columns[14:]]
X_train.shape

(269, 50)

In [336]:
test = df[df.MCI == 1]
X_test = test[test.columns[14:]]
X_test.shape

(208, 50)

In [337]:
y_train = train.AD
y_test = test.conv_2_AD

In [338]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scl = scaler.transform(X_train)
X_test_scl = scaler.transform(X_test)

### TEST CLASSIFIER...  

In [131]:
linear_model.SGDClassifier?

In [170]:
clf = linear_model.SGDClassifier(loss='modified_huber',penalty='l1',random_state=123)
clf.fit(X_train_scl,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
       penalty='l1', power_t=0.5, random_state=123, shuffle=True,
       verbose=0, warm_start=False)

In [171]:
clf.score(X_test_scl,y_test)

0.81730769230769229

In [44]:
X_train.head()

Unnamed: 0,score_1,score_2,score_3,score_4,score_5,score_6,score_7,score_8,score_9,score_10,...,score_41,score_42,score_43,score_44,score_45,score_46,score_47,score_48,score_49,score_50
0,36.098624,16.246768,49.423423,11.010808,20.266399,37.892973,38.492352,16.642427,20.942804,20.974013,...,35.530734,35.859132,32.37315,35.58264,3.5972,20.694638,30.465716,30.551352,17.848491,33.041795
1,43.062481,15.9358,53.969064,14.761239,23.692814,32.879597,37.618853,21.509658,23.507751,25.381932,...,46.745997,37.528679,35.017123,46.444176,-0.549843,28.184171,32.632151,51.51163,16.697365,32.914088
2,40.938085,14.878255,39.350441,10.656971,20.024958,28.681817,25.343064,16.091855,24.666095,25.452511,...,41.090286,39.981865,31.35258,41.385673,8.823693,29.729746,29.393542,31.002191,17.068733,32.878287
3,49.790367,18.55532,56.292473,16.788461,16.884327,38.573613,39.971058,22.594754,28.6684,23.856434,...,40.169767,38.129086,39.121106,55.116683,0.539394,30.381675,39.192471,36.662482,20.824451,44.217368
4,44.789159,19.845556,52.652161,19.516458,19.193052,31.4668,39.275097,21.952112,25.94595,30.156671,...,37.290587,47.598844,43.118446,47.927207,4.419909,33.71041,37.089623,39.986202,20.966313,45.762426


### ADD CONFOUNDS....

In [47]:
X_train_conf = train[train.columns[14:].tolist() + train.columns[3:7].tolist()]
X_train_conf.shape

(269, 54)

In [49]:
X_test_conf = test[test.columns[14:].tolist() + test.columns[3:7].tolist()]
X_test_conf.shape

(208, 54)

In [50]:
scaler_conf = preprocessing.StandardScaler().fit(X_train_conf)
X_train_scl_conf = scaler_conf.transform(X_train_conf)
X_test_scl_conf = scaler_conf.transform(X_test_conf)

In [59]:
clf = linear_model.SGDClassifier()
clf.fit(X_train_scl_conf,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [61]:
clf.score(X_test_scl_conf,y_test)

0.76923076923076927

In [60]:
linear_model.SGDClassifier?

In [62]:
X_train_scl_conf = preprocessing.StandardScaler().fit_transform(X_train_conf)
X_test_scl_conf = preprocessing.StandardScaler().fit_transform(X_test_conf)

In [63]:
clf = linear_model.SGDClassifier()
clf.fit(X_train_scl_conf,y_train)
clf.score(X_test_scl_conf,y_test)

0.75480769230769229

In [76]:
linear_model.SGDClassifier?

### How does classification even work...

In [113]:
clf = linear_model.SGDClassifier()
clf.fit(X_train_scl,y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [114]:
iono = pandas.DataFrame(index=range(len(X_test_scl)),columns = ['Predicted','Actual'])
iono['Predicted'] = clf.predict(X_test_scl)
iono['Actual'] = y_test.values

In [115]:
for x in iono.index: 
    if iono.ix[x,'Predicted'] == iono.ix[x,'Actual']:
        iono.ix[x,'Hit'] = 1
    else:
        iono.ix[x,'Hit'] = 0

In [117]:
iono['Decision_function'] = clf.decision_function(X_test_scl)

In [118]:
iono.head()

Unnamed: 0,Predicted,Actual,Hit,Decision_function
0,0.0,0.0,1.0,-481.956618
1,0.0,0.0,1.0,-94.422851
2,0.0,0.0,1.0,-378.204688
3,0.0,0.0,1.0,-324.825397
4,0.0,0.0,1.0,-134.961902


In [119]:
tp,tn,fp,fn = [],[],[],[]
for i,row in iono.iterrows():
    val = row['Predicted'] - row['Actual']
    if val == 0:
        if row['Actual'] == 1:
            tp.append(i)
        else:
            tn.append(i)
    elif val == 1:
        fp.append(i)
    elif val == -1:
        fn.append(i)
    else:
        print('something went wrong for ',i)

In [121]:
print('sensitivity:' , len(tp)/(len(tp)+len(fn)))
print('specificity:' , len(tn)/(len(tn)+len(fp)))
print('manual acc:', (len(tp)+len(tn))/(len(tp)+len(fn)+len(tn)+len(fp)))
print('automated acc:',clf.score(X_test_scl,y_test))

sensitivity: 0.5686274509803921
specificity: 0.8343949044585988
manual acc: 0.7692307692307693
automated acc: 0.769230769231


In [173]:
stats.ttest_ind(train[train.AD==1][train.columns[1]], train[train.AD==0][train.columns[1]])

Ttest_indResult(statistic=4.5398209182293154, pvalue=8.5252042391917115e-06)

In [226]:
final_weights = all_weights.mean(axis=0)
print(len([i for i in final_weights.index if abs(final_weights[i]) > 0 ]),
      'features selected')

35 features selected


In [230]:
summary = pandas.DataFrame(index=range(len(train)),columns = ['Predicted','Actual'])
summary['Predicted'] = predicted
summary['Actual'] = y[train.index]
for x in summary.index: 
    if summary.ix[x,'Predicted'] == summary.ix[x,'Actual']:
        summary.ix[x,'Hit'] = 1
    else:
        summary.ix[x,'Hit'] = 0


In [320]:
jnk_weights, jnk_pred, jnk_t_pred, jnk_ints = kfold_feature_learning(X_train, X_test, y_train, y_test, 
                                                                p_cutoff=0.1,problem = 'classification', folds = 10, 
                        clf = linear_model.SGDClassifier(loss='modified_huber',penalty='l1',random_state=123), 
                        output='light', scale=True, regcols = None, regdf = None,
                        keep_cols = None, save_int = True)

running ttests for fold 1 of 10
running model for fold 1 of 10
intercept: [-341.97684097]
running ttests for fold 2 of 10
running model for fold 2 of 10
intercept: [-354.5060104]
running ttests for fold 3 of 10
running model for fold 3 of 10
intercept: [-325.44977818]
running ttests for fold 4 of 10
running model for fold 4 of 10
intercept: [-357.93907043]
running ttests for fold 5 of 10
running model for fold 5 of 10
intercept: [-349.70098725]
running ttests for fold 6 of 10
running model for fold 6 of 10
intercept: [-373.5826144]
running ttests for fold 7 of 10
running model for fold 7 of 10
intercept: [-313.68999643]
running ttests for fold 8 of 10
running model for fold 8 of 10
intercept: [-465.12533035]
running ttests for fold 9 of 10
running model for fold 9 of 10
intercept: [-365.61860166]
running ttests for fold 10 of 10
running model for fold 10 of 10
intercept: [-383.43910487]
39 features selected
validation  sensitivity: 0.6352941176470588
validation specificity: 0.869565217

### Now build the optimizer!

In [424]:
def feature_learning_optimizer(train, test, y, t_y, problem = 'regression', 
                               clfs = {'model': linear_model.LassoCV(cv=10)}, verbose = False,
                               ps = [None,0.2,0.1,0.05,0.01,0.005,0.001], folds = [2,3,5,10,20], 
                               scale = True, regcols = None, regdf = None, keep_cols = None,
                               outdir = None, cheat = False, optimize_on = 'acc', output = 'light'):

    ntests = len(clfs) * len(ps) * len(folds)
    print('running %s different tests'%(ntests))
    
    cols = ['clf','p','fold','acc']
    if problem == 'classification':
        cols += ['sens','spec']
    if cheat:
        cols += ['test_acc']
    results = pandas.DataFrame(index = range(ntests),columns = cols)
    
    i = 0
    
    if outdir:
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
    
    for model,clf in clfs.items():
        print('*'*10, 'working on model',model,'*'*10)
        for p in ps:
            print('*'*5, 'p = ',str(p),'*'*5)
            for fold in folds:
                print('*'*2, 'using %s fold cross-validation'%fold,'*'*2)
                val_res, t_res =  kfold_feature_learning(train, test, y, t_y, clf, problem, fold, scale, verbose, 
                                                         p, regcols, regdf, keep_cols, output = 'scores')
                results.loc[results.index[i]]['clf'] = model
                results.loc[results.index[i]]['p'] = p
                results.loc[results.index[i]]['fold'] = fold
                if problem == 'regression':
                    results.loc[results.index[i]]['acc'] = val_res
                    if cheat:
                        results.loc[results.index[i]]['test_acc'] = t_res
                else:
                    results.loc[results.index[i]]['acc'] = val_res[-1]
                    results.loc[results.index[i]]['sens'] = val_res[0]
                    results.loc[results.index[i]]['spec'] = val_res[1]
                    if cheat:
                        results.loc[results.index[i]]['test_acc'] = t_res[-1]
                if outdir:
                    results = results.sort_values(optimize_on, axis=0, ascending = False)
                    results.to_csv(os.path.join(outdir,'optimizer_results'))
                i += 1    
                    
    
    results = results.sort_values(optimize_on, axis=0, ascending = False)
    results.index = range(len(results.index))
    
    fmod = results.ix[results.index[0],'clf']
    fp = results.ix[results.index[0],'p']
    ffold = results.ix[results.index[0],'fold']
    
    opt_model = 'model: %s \n p: %s \n fold %s '%(fmod, fp, ffold)
    
    print('optimal model is as follows \n', opt_model)
    print('maximum validation accuracy:', results.ix[results.index[0],optimize_on])
    
    
    print(('*'*10, 'RUNNING OPTIMAL MODEL','*'*10))
    fmodel_output = kfold_feature_learning(train, test, y, t_y, 
                                            models[fmod], problem, ffold, 
                                            scale, True, fp, regcols, regdf, 
                                            keep_cols, output = output)
    
    return fmodel_output

### Build model matrix

In [436]:
losses = ['hinge','log', 'modified_huber', 'squared_hinge']
penalties = ['none', 'l2', 'l1', 'elasticnet']
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
models = {}
for loss in losses:
    print('working on loss', loss)
    for penalty in penalties:
        for alpha in alphas:
            modlab = '%s_%s_%s'%(loss, penalty, alpha)
            model = linear_model.SGDClassifier(loss, penalty, alpha, random_state = 123)
            models.update({modlab: model})

working on loss hinge
working on loss log
working on loss modified_huber
working on loss squared_hinge


### Run optimizer

In [395]:
out = '/Users/jakevogel/Dropbox/Work/Projects/ADNI_MCI_conv_classifier/'
model_output = feature_learning_optimizer(X_train, X_test, y_train, y_test, clfs = models, verbose = False,
                                            problem = 'classification', outdir = out, cheat = True)

running 4480 different tests
********** working on model hinge_l2_1 **********
***** p =  None *****
** using 2 fold cross-validation **
** using 3 fold cross-validation **
** using 5 fold cross-validation **
** using 10 fold cross-validation **
** using 20 fold cross-validation **
***** p =  0.2 *****
** using 2 fold cross-validation **
** using 3 fold cross-validation **
** using 5 fold cross-validation **
** using 10 fold cross-validation **
** using 20 fold cross-validation **
***** p =  0.1 *****
** using 2 fold cross-validation **
** using 3 fold cross-validation **
** using 5 fold cross-validation **
** using 10 fold cross-validation **
** using 20 fold cross-validation **
***** p =  0.05 *****
** using 2 fold cross-validation **
** using 3 fold cross-validation **
** using 5 fold cross-validation **
** using 10 fold cross-validation **
** using 20 fold cross-validation **
***** p =  0.01 *****
** using 2 fold cross-validation **
** using 3 fold cross-validation **
** using 5 fo

KeyError: 'model'

In [399]:
optimize_on = 'acc'

results = pandas.read_csv('/Users/jakevogel/Dropbox/Work/Projects/ADNI_MCI_conv_classifier/optimizer_results')
results = results.sort_values(optimize_on, axis=0, ascending = False)
results.index = range(len(results.index))
results.head()

Unnamed: 0.1,Unnamed: 0,clf,p,fold,acc,sens,spec,test_acc
0,254,hinge_l2_0.1,0.2,20,0.869888,0.694118,0.951087,0.817308
1,534,modified_huber_elasticnet_0.1,0.2,20,0.862454,0.682353,0.945652,0.831731
2,2179,log_l2_0.01,0.2,20,0.858736,0.694118,0.934783,0.826923
3,3999,log_elasticnet_0.01,0.2,20,0.858736,0.694118,0.934783,0.836538
4,4454,hinge_elasticnet_0.1,0.2,20,0.858736,0.658824,0.951087,0.8125


In [410]:
fmod = results.ix[results.index[0],'clf']
fp = results.ix[results.index[0],'p']
ffold = results.ix[results.index[0],'fold']
opt_model = 'model: %s \n p: %s \n fold %s '%(fmod, fp, ffold)

In [403]:
print('optimal model is as follows \n', opt_model)
print('maximum validation accuracy:', results.ix[results.index[0],optimize_on])

optimal model is as follows 
 model: hinge_l2_0.1 
 p: 0.2 
 fold 20 
maximum validation accuracy: 0.869888475836


In [414]:
odf.head()

Unnamed: 0.1,Unnamed: 0,clf,p,fold,acc,sens,spec,test_acc
0,254,hinge_l2_0.1,0.2,20,0.869888,0.694118,0.951087,0.817308
1,534,modified_huber_elasticnet_0.1,0.2,20,0.862454,0.682353,0.945652,0.831731
2,2179,log_l2_0.01,0.2,20,0.858736,0.694118,0.934783,0.826923
3,3999,log_elasticnet_0.01,0.2,20,0.858736,0.694118,0.934783,0.836538
4,4454,hinge_elasticnet_0.1,0.2,20,0.858736,0.658824,0.951087,0.8125


In [415]:
odf.sort_values('test_acc',axis=0,ascending=False).head()

Unnamed: 0.1,Unnamed: 0,clf,p,fold,acc,sens,spec,test_acc
1912,4112,modified_huber_l1_0.1,0.05,5,0.765799,0.505882,0.88587,0.855769
359,1874,hinge_elasticnet_0.001,0.05,20,0.814126,0.705882,0.86413,0.850962
589,4079,squared_hinge_l2_0.0001,0.05,20,0.806691,0.741176,0.836957,0.846154
338,3449,modified_huber_elasticnet_1e-05,0.05,20,0.817844,0.764706,0.842391,0.846154
364,2714,modified_huber_elasticnet_0.001,0.05,20,0.814126,0.705882,0.86413,0.846154


In [405]:
train = X_train 
test = X_test 
y = y_train 
t_y = y_test 
clf = models['hinge_l2_0.1']
problem = 'classification'
scale = True
regcols = None
regdf = None
keep_cols = None
output = 'light'

print(('*'*10, 'RUNNING OPTIMAL MODEL','*'*10))
fmodel_output = kfold_feature_learning(train, test, y, t_y, 
                                    clf, problem, ffold, 
                                    scale, fp, regcols, regdf, 
                                    keep_cols, output = output)



('**********', 'RUNNING OPTIMAL MODEL', '**********')
running model for fold 1 of 20
running model for fold 2 of 20
running model for fold 3 of 20
running model for fold 4 of 20
running model for fold 5 of 20
running model for fold 6 of 20
running model for fold 7 of 20
running model for fold 8 of 20
running model for fold 9 of 20
running model for fold 10 of 20
running model for fold 11 of 20
running model for fold 12 of 20
running model for fold 13 of 20
running model for fold 14 of 20
running model for fold 15 of 20
running model for fold 16 of 20
running model for fold 17 of 20
running model for fold 18 of 20
running model for fold 19 of 20
running model for fold 20 of 20
50 features selected
validation  sensitivity: 0.6941176470588235
validation specificity: 0.9239130434782609
validation accuracy: 0.8513011152416357
testing  sensitivity: 0.45098039215686275
testing specificity: 0.9426751592356688
testing accuracy: 0.8221153846153846


In [407]:
clf = models['modified_huber_elasticnet_0.1']
fmodel_output2 = kfold_feature_learning(train, test, y, t_y, 
                                    clf, problem, ffold, 
                                    scale, fp, regcols, regdf, 
                                    keep_cols, output = output)

running model for fold 1 of 20
running model for fold 2 of 20
running model for fold 3 of 20
running model for fold 4 of 20
running model for fold 5 of 20
running model for fold 6 of 20
running model for fold 7 of 20
running model for fold 8 of 20
running model for fold 9 of 20
running model for fold 10 of 20
running model for fold 11 of 20
running model for fold 12 of 20
running model for fold 13 of 20
running model for fold 14 of 20
running model for fold 15 of 20
running model for fold 16 of 20
running model for fold 17 of 20
running model for fold 18 of 20
running model for fold 19 of 20
running model for fold 20 of 20
47 features selected
validation  sensitivity: 0.6941176470588235
validation specificity: 0.9239130434782609
validation accuracy: 0.8513011152416357
testing  sensitivity: 0.5686274509803921
testing specificity: 0.9044585987261147
testing accuracy: 0.8221153846153846


In [409]:
clf = models['log_l2_0.01']
fmodel_output2 = kfold_feature_learning(train, test, y, t_y, 
                                    clf, problem, ffold, 
                                    scale, fp, regcols, regdf, 
                                    keep_cols, output = output)

running model for fold 1 of 20
running model for fold 2 of 20
running model for fold 3 of 20
running model for fold 4 of 20
running model for fold 5 of 20
running model for fold 6 of 20
running model for fold 7 of 20
running model for fold 8 of 20
running model for fold 9 of 20
running model for fold 10 of 20
running model for fold 11 of 20
running model for fold 12 of 20
running model for fold 13 of 20
running model for fold 14 of 20
running model for fold 15 of 20
running model for fold 16 of 20
running model for fold 17 of 20
running model for fold 18 of 20
running model for fold 19 of 20
running model for fold 20 of 20
50 features selected
validation  sensitivity: 0.7176470588235294
validation specificity: 0.9021739130434783
validation accuracy: 0.8438661710037175
testing  sensitivity: 0.5098039215686274
testing specificity: 0.8980891719745223
testing accuracy: 0.8028846153846154


In [411]:
clf = models['log_elasticnet_0.01']
fmodel_output2 = kfold_feature_learning(train, test, y, t_y, 
                                    clf, problem, ffold, 
                                    scale, fp, regcols, regdf, 
                                    keep_cols, output = output)

running model for fold 1 of 20
running model for fold 2 of 20
running model for fold 3 of 20
running model for fold 4 of 20
running model for fold 5 of 20
running model for fold 6 of 20
running model for fold 7 of 20
running model for fold 8 of 20
running model for fold 9 of 20
running model for fold 10 of 20
running model for fold 11 of 20
running model for fold 12 of 20
running model for fold 13 of 20
running model for fold 14 of 20
running model for fold 15 of 20
running model for fold 16 of 20
running model for fold 17 of 20
running model for fold 18 of 20
running model for fold 19 of 20
running model for fold 20 of 20
50 features selected
validation  sensitivity: 0.6941176470588235
validation specificity: 0.9184782608695652
validation accuracy: 0.8475836431226765
testing  sensitivity: 0.5490196078431373
testing specificity: 0.8980891719745223
testing accuracy: 0.8125


In [412]:
clf = models['hinge_elasticnet_0.1']
fmodel_output2 = kfold_feature_learning(train, test, y, t_y, 
                                    clf, problem, ffold, 
                                    scale, fp, regcols, regdf, 
                                    keep_cols, output = output)

running model for fold 1 of 20
running model for fold 2 of 20
running model for fold 3 of 20
running model for fold 4 of 20
running model for fold 5 of 20
running model for fold 6 of 20
running model for fold 7 of 20
running model for fold 8 of 20
running model for fold 9 of 20
running model for fold 10 of 20
running model for fold 11 of 20
running model for fold 12 of 20
running model for fold 13 of 20
running model for fold 14 of 20
running model for fold 15 of 20
running model for fold 16 of 20
running model for fold 17 of 20
running model for fold 18 of 20
running model for fold 19 of 20
running model for fold 20 of 20
41 features selected
validation  sensitivity: 0.6588235294117647
validation specificity: 0.9510869565217391
validation accuracy: 0.8587360594795539
testing  sensitivity: 0.43137254901960786
testing specificity: 0.9490445859872612
testing accuracy: 0.8221153846153846


In [438]:
train = X_train 
test = X_test 
y = y_train 
t_y = y_test 
clf = models['hinge_l2_1']
problem = 'classification'
outdir = out 
cheat = True
p = None
fold = 2
scale = True
jnk =  kfold_feature_learning(train, test, y, t_y, clf, problem, fold, scale, p, output = 'heavy')


In [441]:
rmod = jnk[-1]
rmod.

In [418]:
losses = ['hinge','log', 'modified_huber', 'squared_hinge']
penalties = ['none', 'l2', 'l1', 'elasticnet']
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1]
models = {}
for loss in losses:
    print('working on loss', loss)
    for penalty in penalties:
        for alpha in alphas:
            modlab = '%s_%s_%s'%(loss, penalty, alpha)
            model = linear_model.SGDClassifier(loss, penalty, alpha, random_state = 123)
            models.update({modlab: model})

working on loss hinge
working on loss log
working on loss modified_huber
working on loss squared_hinge


In [434]:
folds = [30, 50]
out = '/Users/jakevogel/Dropbox/Work/Projects/ADNI_MCI_conv_classifier/optimizer_highfold'
model_output = feature_learning_optimizer(X_train, X_test, y_train, y_test, clfs = models, verbose = False, ps=[None],
                                            folds = folds, problem = 'classification', outdir = out, cheat = True,
                                         output = 'heavy')

running 160 different tests
********** working on model modified_huber_l2_0.0001 **********
***** p =  None *****
** using 30 fold cross-validation **
** using 50 fold cross-validation **


KeyboardInterrupt: 

In [528]:
sys.path.insert(0,'/Users/jakevogel/git/hack_projects/')
import kfold_learning as kfl

In [433]:
linear_model.LassoCV?

In [442]:
from sklearn import model_selection

In [449]:
model_selection.GridSearchCV?

In [470]:
linear_model.SGDClassifier?

In [459]:
params1 = {'loss': ['hinge','log', 'modified_huber', 'squared_hinge', 'perceptron'],
          'penalty': ['none', 'l2', 'l1'],
          'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1]} 
params2 = {'loss': ['hinge','log', 'modified_huber', 'squared_hinge'],
          'penalty': ['elasticnet'],
          'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1],
          'l1_ratio': [0.05,0.1,0.15,0.2,0.3,0.5]} 

In [464]:
selector = model_selection.GridSearchCV(linear_model.SGDClassifier(random_state=123),[params1,params2],cv=10)

In [471]:
selected = selector.fit(X_train_scl,y_train)

In [477]:
jnk = selected.best_estimator_

In [478]:
jnk

SGDClassifier(alpha=0.1, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.1, learning_rate='optimal',
       loss='hinge', n_iter=5, n_jobs=1, penalty='elasticnet', power_t=0.5,
       random_state=123, shuffle=True, verbose=0, warm_start=False)

In [473]:
selected.best_score_

0.87732342007434949

In [469]:
selected.decision_function(X_train)

array([-15826.84975848,  -3052.63517232,  -6831.21073218, -11879.52565321,
       -14200.70091118, -10954.18258816,  -7784.47518973,   -696.55370199,
        -5853.4070997 ,  -9833.62218028,  -5812.31039412,  -8423.96837495,
        -6832.44332798,  -2233.11744147, -17167.76267435,  -3326.54801759,
       -12334.96222927,  -7119.1774772 ,  -8459.17188738,  -3856.66054231,
       -10912.21704117, -10106.1809877 ,  -8869.49176869,  -3007.91915078,
           60.55229195,  -6106.68329728, -10154.58381149,   4824.97605852,
       -10198.64161614,    -44.82302733, -16553.8657833 ,  -6376.03293557,
         1478.25338092,   -578.08296345,  -2217.97914297,  11074.1042724 ,
        -9431.12809941,   -668.94810844, -11237.69167615,  -3255.60742935,
        -2943.58169725,  -6194.81004753,  -1207.10923562,  -2057.76059375,
        -8413.72909211,    306.65692036,  -8501.44576606,  -7953.82125148,
        11575.34777705,  -6204.70625594,  -2167.69643798, -10483.56947846,
         5671.47779566,  

In [476]:
selected.score(X_test_scl, y_test)

0.80769230769230771

In [486]:
output = kfold_feature_learning(X_train, X_test, y_train, y_test, clf = selector, search = True,
                                p_cutoff=None,problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True)

running model for fold 1 of 3
running model for fold 2 of 3
running model for fold 3 of 3
46 features selected
validation  sensitivity: 0.7058823529411765
validation specificity: 0.9239130434782609
validation accuracy: 0.8550185873605948
testing  sensitivity: 0.5490196078431373
testing specificity: 0.9235668789808917
testing accuracy: 0.8317307692307693


In [532]:
from importlib import reload

In [582]:
reload(kfl)

<module 'kfold_learning' from '/Users/jakevogel/git/hack_projects/kfold_learning.py'>

In [577]:
clf = RandomForestClassifier(random_state=123)
rf_output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = clf,
                                p_cutoff=None,problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True)

random forest detected, setting vote to 'soft'
running model for fold 1 of 3
running model for fold 2 of 3
running model for fold 3 of 3
50 features selected
validation  sensitivity: 0.4588235294117647
validation specificity: 0.9184782608695652
validation accuracy: 0.7732342007434945
there are 16 ambiguous cases. Setting to misses...
ambiguous cases: [287, 303, 312, 315, 326, 360, 365, 372, 375, 379, 385, 428, 430, 449, 450, 454]
testing  sensitivity: 0.13725490196078433
testing specificity: 0.9554140127388535
testing accuracy: 0.7548076923076923


In [583]:
output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = selector, search = True,
                                p_cutoff=None,problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True,
                               vote = 'hard')

running model for fold 1 of 3
running model for fold 2 of 3
running model for fold 3 of 3
46 features selected
validation  sensitivity: 0.7058823529411765
validation specificity: 0.9239130434782609
validation accuracy: 0.8550185873605948
testing  sensitivity: 0.5098039215686274
testing specificity: 0.9363057324840764
testing accuracy: 0.8317307692307693


In [587]:
print(y_train.mean())
print(y_test.mean())

0.3159851301115242
0.24519230769230768


In [595]:
convs = y_test[y_test == 1].index.tolist()
nonconvs = np.random.permutation(y_test[y_test == 0].index.tolist())
bal_idx = np.random.permutation(convs + nonconvs[:len(convs)].tolist())

bal_x_test = X_test.loc[bal_idx]
bal_y_test = y_test.loc[bal_idx]

In [596]:
bal_output = kfl.kfold_feature_learning(X_train, bal_x_test, y_train, bal_y_test, clf = selector, search = True,
                                p_cutoff=None,problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True,
                               vote = 'hard')

running model for fold 1 of 3
running model for fold 2 of 3
running model for fold 3 of 3
46 features selected
validation  sensitivity: 0.7058823529411765
validation specificity: 0.9239130434782609
validation accuracy: 0.8550185873605948
testing  sensitivity: 0.5098039215686274
testing specificity: 0.9607843137254902
testing accuracy: 0.7352941176470589


In [597]:
from sklearn import ensemble

In [598]:
ensemble.AdaBoostClassifie

In [605]:
clf = ensemble.AdaBoostClassifier(random_state=123)
rf_output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = clf, vote = 'soft',
                                p_cutoff=None, problem = 'classification', folds = 10, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True)

running model for fold 1 of 10
running model for fold 2 of 10
running model for fold 3 of 10
running model for fold 4 of 10
running model for fold 5 of 10
running model for fold 6 of 10
running model for fold 7 of 10
running model for fold 8 of 10
running model for fold 9 of 10
running model for fold 10 of 10
48 features selected
validation  sensitivity: 0.611764705882353
validation specificity: 0.8315217391304348
validation accuracy: 0.7620817843866171
testing  sensitivity: 0.43137254901960786
testing specificity: 0.8853503184713376
testing accuracy: 0.7740384615384616


In [609]:
clf = ensemble.ExtraTreesClassifier(random_state=123)
rf_output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = clf, vote = 'soft',
                                p_cutoff=None, problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True)

random forest detected, setting vote to 'soft'
running model for fold 1 of 3
running model for fold 2 of 3
running model for fold 3 of 3
50 features selected
validation  sensitivity: 0.3764705882352941
validation specificity: 0.9347826086956522
validation accuracy: 0.758364312267658
there are 13 ambiguous cases. Setting to hits...
ambiguous cases: [273, 312, 326, 332, 336, 358, 379, 381, 389, 404, 407, 423, 452]
testing  sensitivity: 0.27450980392156865
testing specificity: 0.9363057324840764
testing accuracy: 0.7740384615384616


In [618]:
clf = ensemble.GradientBoostingClassifier(random_state=123)
rf_output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = clf, vote = 'soft',
                                p_cutoff=0.05, problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True)

random forest detected, setting vote to 'soft'
running ttests for fold 1 of 3
running model for fold 1 of 3
running ttests for fold 2 of 3
running model for fold 2 of 3
running ttests for fold 3 of 3
running model for fold 3 of 3
35 features selected
validation  sensitivity: 0.5411764705882353
validation specificity: 0.9239130434782609
validation accuracy: 0.8029739776951673
testing  sensitivity: 0.35294117647058826
testing specificity: 0.9745222929936306
testing accuracy: 0.8221153846153846


### Now trying with weighted folds

In [637]:
reload(kfl)

<module 'kfold_learning' from '/Users/jakevogel/git/hack_projects/kfold_learning.py'>

In [634]:
output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = selector, search = True,
                                p_cutoff=None,problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True,
                               vote = 'hard', weighted = True)

running model for fold 1 of 3
running model for fold 2 of 3
running model for fold 3 of 3
46 features selected
validation  sensitivity: 0.7058823529411765
validation specificity: 0.9239130434782609
validation accuracy: 0.8550185873605948
testing  sensitivity: 0.5098039215686274
testing specificity: 0.9363057324840764
testing accuracy: 0.8317307692307693


In [635]:
output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = selector, search = True,
                                p_cutoff=None,problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True,
                               vote = None, weighted = True)

running model for fold 1 of 3
running model for fold 2 of 3
running model for fold 3 of 3
46 features selected
validation  sensitivity: 0.7058823529411765
validation specificity: 0.9239130434782609
validation accuracy: 0.8550185873605948
testing  sensitivity: 0.43137254901960786
testing specificity: 0.9490445859872612
testing accuracy: 0.8221153846153846


In [645]:
clf = ensemble.GradientBoostingClassifier(random_state=123)
rf_output = kfl.kfold_feature_learning(X_train, X_test, y_train, y_test, clf = clf, vote = 'soft',
                                p_cutoff=0.05, problem = 'classification', folds = 3, output='heavy', 
                                scale=True, regcols = None, regdf = None, keep_cols = None, save_int = True,
                                      weighted = True)

random forest detected, setting vote to 'soft'
running ttests for fold 1 of 3
running model for fold 1 of 3
running ttests for fold 2 of 3
running model for fold 2 of 3
running ttests for fold 3 of 3
running model for fold 3 of 3
35 features selected
validation  sensitivity: 0.5411764705882353
validation specificity: 0.9239130434782609
validation accuracy: 0.8029739776951673
testing  sensitivity: 0.35294117647058826
testing specificity: 0.9745222929936306
testing accuracy: 0.8221153846153846
