In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import scipy as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc
from sklearn.feature_selection import SelectKBest, SelectFromModel, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import seaborn as sns

In [3]:
# Load the feature table; the first spreadsheet column is used as the row index.
# NOTE(review): this is an absolute, machine-specific path - consider moving it
# to a configurable data directory so the notebook runs elsewhere.
data_path = r"C:\Users\e.lavrova\Desktop\DataAll.xlsx"
df = pd.read_excel(data_path, index_col=0)

In [24]:
# One stratified shuffle split: 20 % of the rows are held out for testing,
# with a fixed seed so the split is reproducible.
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=0,
)

In [28]:
# Materialise the stratified split (stratified on the 'Outcome' column).
# NOTE(review): X_train / X_test are row subsets of the full frame, so they
# still contain the 'Outcome' column itself.
for train_index, test_index in sss.split(df, df['Outcome']):
    X_train = df.iloc[train_index]
    X_test = df.iloc[test_index]
    y_train = df['Outcome'].iloc[train_index]
    y_test = df['Outcome'].iloc[test_index]

In [40]:
# Row counts of the train / test parts (features and targets must agree pairwise).
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(1254, 314, 1254, 314)

In [30]:
def selectNonIntercorrelated(df_in, ftrs, outc, corr_th):
    """Return the features from ``ftrs`` that are not highly intercorrelated.

    The absolute Spearman correlation coefficient is computed for every pair
    of columns listed in ``ftrs``. For each pair whose coefficient exceeds
    ``corr_th``, the member with the larger mean absolute correlation to the
    candidate features is marked as a feature to drop. On a tie, the feature
    that comes first in the correlation matrix is dropped.

    Only the columns named in ``ftrs`` enter the correlation matrix, so the
    outcome and any other columns of ``df_in`` can neither be dropped nor
    influence which feature of a pair is dropped.

    NOTE: the selection is a single pass over all rows of ``df_in``; no
    bootstrap resampling is performed.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Feature values (columns = features, rows = observations).
    ftrs : iterable of str
        Names of the columns of ``df_in`` to analyse (a list or a pandas Index).
    outc : object
        Outcome variable. Currently unused; kept so existing calls keep working.
    corr_th : float
        Pairs with an absolute Spearman coefficient strictly greater than
        this threshold are treated as intercorrelated.

    Returns
    -------
    list of str
        The entries of ``ftrs``, in their original order, that were not
        marked to be dropped.
    """
    candidate_features = list(ftrs)

    if not candidate_features:
        # Nothing to analyse; keep the printed report consistent with the normal path.
        print ('Non intercorrelated features: ', candidate_features)
        return candidate_features

    # De-duplicate (order preserving) so the correlation matrix has unique labels.
    unique_features = list(dict.fromkeys(candidate_features))

    # Restrict the matrix to the candidate features only.
    corr_matrix = df_in[unique_features].corr(method='spearman').abs()
    mean_absolute_corr = corr_matrix.mean()

    # Keep the strict upper triangle so every pair is inspected exactly once.
    # The builtin ``bool`` replaces the ``np.bool`` alias removed from NumPy.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    high_corrs = upper.where(upper > corr_th).dropna(how='all', axis=1).dropna(how='all', axis=0)

    intercorrelated_features = set()
    for feature in high_corrs.columns:
        partners = high_corrs.index[high_corrs[feature].notna()]
        for partner in partners:
            # Drop whichever member of the pair is, on average, more
            # correlated with the other candidate features.
            if mean_absolute_corr[feature] > mean_absolute_corr[partner]:
                intercorrelated_features.add(feature)
            else:
                intercorrelated_features.add(partner)

    non_intercorrelated_features_set = [
        name for name in candidate_features if name not in intercorrelated_features
    ]

    print ('Non intercorrelated features: ', non_intercorrelated_features_set)

    return non_intercorrelated_features_set

In [55]:
# Select the features that are not flagged as intercorrelated
# (pairs with |Spearman rho| > 0.90 are treated as intercorrelated).
# NOTE(review): selection runs on the full frame `df`, not on the training
# split - confirm this is intended.
f = selectNonIntercorrelated(
    df_in=df,
    ftrs=df.columns[2:],
    outc=df['Outcome'],
    corr_th=0.90,
)

Non intercorrelated features:  ['Fractal_average', 'Fractal_sd', 'GLCM_clusProm', 'GLCM_diffVar', 'GLCM_infoCorr2', 'GLCM_invDiffMomNor', 'GLCM_maxCorr', 'GLDZM_DZE', 'GLDZM_DZN', 'GLDZM_DZNN', 'GLDZM_HILDE', 'GLDZM_HISDE', 'GLDZM_SDE', 'GLRLM_GLV', 'GLSZM_HILAE', 'GLSZM_HISAE', 'GLSZM_INN', 'GLSZM_IV', 'GLSZM_SAE', 'GLSZM_SZN', 'GLSZM_ZE', 'IH_cov', 'IH_energy', 'IH_iqr', 'IH_maxGrad', 'IH_maxGradI', 'IH_medianD', 'IH_min', 'IH_qcod', 'IH_range', 'LocInt_peakGlobal', 'LocInt_peakLocal', 'NGLDM_DN', 'NGLDM_DV', 'NGLDM_LGSDE', 'NGTDM_busyness', 'NGTDM_contrast', 'NGTDM_strength', 'Stats_median', 'Stats_min', 'Stats_p10']


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [56]:
# Add the outcome column to the selected feature names so that df[f] carries
# the target next to the predictors. The membership check keeps the cell
# idempotent: re-running it must not append 'Outcome' a second time, which
# would produce duplicate columns in df[f].
if 'Outcome' not in f:
    f.append('Outcome')

In [58]:
# Number of entries in the selection list (the selected features plus 'Outcome').
len(f)

42

In [59]:
# Show the table restricted to the columns listed in f, all rows kept.
df.loc[:, f]

Unnamed: 0_level_0,Fractal_average,Fractal_sd,GLCM_clusProm,GLCM_diffVar,GLCM_infoCorr2,GLCM_invDiffMomNor,GLCM_maxCorr,GLDZM_DZE,GLDZM_DZN,GLDZM_DZNN,...,NGLDM_DN,NGLDM_DV,NGLDM_LGSDE,NGTDM_busyness,NGTDM_contrast,NGTDM_strength,Stats_median,Stats_min,Stats_p10,Outcome
General_PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c5s,2.813451,0.321175,331069.0514,14.718144,0.687739,0.996440,0.755512,9.538700,29981.47936,0.050823,...,394339.3609,5.686929,0.004858,404.125989,0.040165,0.024143,-835.359575,-1024.000000,-925.374944,0
blh,2.707012,0.382505,455468.2833,17.522910,0.647886,0.997500,0.766903,9.676704,23497.80115,0.047231,...,324353.9989,10.046534,0.006465,414.219483,0.017624,0.078751,-895.023577,-1024.000000,-956.429189,0
bl7,2.743963,0.361328,418126.0490,16.025600,0.635789,0.995095,0.755787,9.399565,23416.98858,0.058048,...,271989.1541,8.253220,0.005720,437.513628,0.035353,0.031286,-872.248661,-1024.000000,-941.601103,0
c7q,2.775288,0.353645,384201.4824,15.005784,0.656585,0.996956,0.747660,9.381632,25330.24984,0.060267,...,297826.6352,8.057091,0.006793,389.855258,0.022627,0.056936,-886.495480,-1024.000000,-956.590460,0
crr,2.704704,0.384258,368110.0345,18.873214,0.585324,0.997719,0.730153,9.556085,16610.22351,0.045294,...,238303.0407,9.796857,0.006093,341.835469,0.020170,0.070329,-876.905111,-1024.000000,-941.982032,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CD-Emphysema-213,2.701500,0.398347,631380.1620,21.182347,0.744052,0.994270,0.769572,9.880251,25744.46293,0.045471,...,331128.2053,10.656161,0.001196,411.487829,0.056185,0.017196,-916.093980,-1127.062137,-985.868053,3
CD-Emphysema-214,2.688999,0.399328,696309.2621,22.447844,0.658174,0.997517,0.756590,9.616351,22591.12594,0.049050,...,299303.7426,9.566123,0.001732,252.406366,0.018047,0.107661,-929.707807,-1108.283786,-989.305837,3
CD-Emphysema-215,2.760121,0.363605,344902.1814,22.390808,0.673695,0.992205,0.667105,9.468194,28328.02760,0.055563,...,296338.0050,6.574283,0.001094,222.458526,0.056372,0.015540,-792.645690,-1113.129107,-885.206391,3
CD-Emphysema-217,2.667280,0.404107,504294.0034,18.251871,0.662241,0.997048,0.751549,9.614913,24052.39192,0.052994,...,296514.6787,11.605443,0.001127,316.448350,0.020288,0.052818,-925.974157,-1125.950746,-982.252038,3
