In [1]:
import pandas as pd
import numpy as np
import sklearn
import scipy

In [2]:
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLarsCV
from sklearn.datasets import make_regression

In [6]:
# Read the CSV into the DataFrame `data`. The path is relative, so it is
# resolved against the kernel's current working directory.
data = read_csv('Lung 1 radiomics training set.csv')

In [7]:
# Feature table: every column of `data` except the 'survivaltime' column.
X = data.drop(columns=['survivaltime'])

In [8]:
# Dependent variable: the 'survivaltime' column of `data` as a Series.
y = data["survivaltime"]

In [9]:
# Display the dependent-variable Series as the cell output.
y

0      2165
1       155
2       256
3       141
4       353
       ... 
415     346
416    2772
417    2429
418     369
419    1590
Name: survivaltime, Length: 420, dtype: int64

In [10]:
# Pairwise Pearson correlation between the feature columns of X.
corrMatrix = X.corr()
allVars = corrMatrix.keys()

# Absolute correlation of every feature with the dependent variable y.
# NOTE(review): the filter below never reads this list, so which member of a
# correlated group survives is decided by column order, not by the strength of
# its association with y. Confirm that this is the intended selection rule.
absCorrWithDep = [abs(y.corr(X[var])) for var in allVars]

# Two features whose absolute correlation exceeds this are treated as redundant.
corrTol = 0.7

# Greedy pairwise filter: walk the features in column order. A feature that is
# still present is kept, and every *other* feature whose absolute correlation
# with it exceeds corrTol is discarded.
absCorr = corrMatrix.abs()
droppedVars = set()
for col in allVars:
    if col in droppedVars:
        # Already removed because of an earlier, kept feature.
        continue

    # NaN correlations compare False here, so they never trigger a drop.
    isRedundant = absCorr[col] > corrTol
    # Label-based selection; the feature's own entry is excluded explicitly.
    droppedVars.update(var for var in isRedundant[isRedundant].index if var != col)

# Reduce the correlation matrix (rows and columns) to the surviving features,
# preserving their original order.
keptVars = [var for var in allVars if var not in droppedVars]
corrMatrix = corrMatrix.loc[keptVars, keptVars]

In [11]:
# Number of feature columns remaining in the filtered correlation matrix.
len(corrMatrix.columns)

95

In [12]:
# Names of the feature columns remaining in the filtered correlation matrix.
corrMatrix.columns

Index(['ori-shapeVoxelVolume', 'ori-shapeMajorAxisLength',
       'ori-shapeSphericity', 'ori-shapeElongation',
       'ori-shapeSurfaceVolumeRatio', 'ori-gldmGrayLevelVariance',
       'ori-gldmHighGrayLevelEmphasis', 'ori-gldmDependenceEntropy',
       'ori-gldmLargeDependenceEmphasis',
       'ori-gldmLargeDependenceLowGrayLevelEmphasis', 'ori-glcmCorrelation',
       'ori-glcmImc2', 'ori-firstorderSkewness', 'ori-firstorderMedian',
       'ori-firstorderEnergy', 'ori-firstorderMaximum',
       'ori-firstorder90Percentile', 'ori-firstorderMinimum',
       'ori-firstorderKurtosis', 'ori-glrlmShortRunLowGrayLevelEmphasis',
       'ori-glszmZoneVariance', 'ori-glszmSizeZoneNonUniformityNormalized',
       'ori-ngtdmStrength', 'ori-ngtdmContrast',
       'wavelet-HLLgldmHighGrayLevelEmphasis',
       'wavelet-HLLgldmLargeDependenceLowGrayLevelEmphasis',
       'wavelet-HLLgldmLargeDependenceHighGrayLevelEmphasis',
       'wavelet-HLLgldmLowGrayLevelEmphasis', 'wavelet-HLLglcmClusterShad

In [13]:
# Display the columns of `data` that remain in the filtered correlation matrix.
data[corrMatrix.columns]

Unnamed: 0,ori-shapeVoxelVolume,ori-shapeMajorAxisLength,ori-shapeSphericity,ori-shapeElongation,ori-shapeSurfaceVolumeRatio,ori-gldmGrayLevelVariance,ori-gldmHighGrayLevelEmphasis,ori-gldmDependenceEntropy,ori-gldmLargeDependenceEmphasis,ori-gldmLargeDependenceLowGrayLevelEmphasis,...,wavelet-HHLglcmMCC,wavelet-HHLfirstorderSkewness,wavelet-HHLfirstorderMedian,wavelet-HHLfirstorderMean,wavelet-HHLglszmSizeZoneNonUniformityNormalized,wavelet-HHLglszmLowGrayLevelZoneEmphasis,wavelet-LLLgldmLargeDependenceLowGrayLevelEmphasis,wavelet-LLLgldmLargeDependenceHighGrayLevelEmphasis,wavelet-LLLglcmMCC,wavelet-LLLfirstorderVariance
0,156323.43290,84.238088,0.601997,0.730057,0.149209,65.890341,1583.334962,7.055278,64.282088,0.102407,...,0.164230,-0.020109,-0.061794,0.001454,0.343413,0.046005,0.002572,407270.5431,0.784893,296209.9611
1,359468.93970,92.771555,0.688025,0.880026,0.098881,88.444688,1651.377715,7.413443,76.437748,0.052014,...,0.338236,0.004120,-0.072820,0.040810,0.338172,0.013171,0.003611,728457.3600,0.914684,416983.2558
2,34818.35433,72.068431,0.554442,0.541368,0.267595,99.383954,1355.354470,7.284141,25.568303,0.019421,...,0.272501,0.014406,0.032556,0.039549,0.412444,0.029951,0.001756,156217.7649,0.784744,437040.9444
3,84460.25848,107.274834,0.487099,0.490820,0.226502,43.820066,1533.333966,7.184589,40.228007,0.026679,...,0.693576,0.078155,-0.103746,0.063471,0.362621,0.014503,0.002081,312064.5537,0.938403,196763.5015
4,83548.01431,78.568657,0.469560,0.710922,0.235959,56.427838,1785.882712,7.245794,43.938237,0.027079,...,0.446951,0.109717,-0.034239,0.073095,0.367110,0.011779,0.001697,361506.2058,0.800076,241866.1790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,94782.82928,77.260243,0.572603,0.956116,0.185361,42.388159,1691.335476,6.821508,101.637206,0.073470,...,0.535032,1.316625,-0.003670,0.053653,0.343274,0.007007,0.002881,746357.6996,0.858213,194123.6959
416,132505.41690,87.610311,0.576435,0.642440,0.164681,28.020261,1660.491514,6.997304,107.774755,0.062325,...,0.205164,-0.017763,0.035327,0.060716,0.346115,0.078278,0.003506,510071.9047,0.704937,134096.4671
417,18980.02625,35.624951,0.757406,0.873311,0.239817,46.072588,1103.694754,6.972423,88.975580,0.069912,...,0.167609,0.103094,-0.169767,0.164422,0.354962,0.156299,0.003382,345312.7003,0.712573,219888.8339
418,42237.28180,51.751822,0.677408,0.775330,0.205271,73.889902,1516.309896,7.052117,95.202262,0.078652,...,0.349003,0.119479,0.036405,0.076638,0.310309,0.098469,0.005374,471769.8397,0.806936,350011.5605


In [14]:
# Keep only the columns of `data` that remain in the filtered correlation
# matrix. 'survivaltime' is not among them because X excluded it.
data1 = data[corrMatrix.columns]

In [15]:
# Write the selected feature columns to a CSV file in the working directory.
# index=False (the documented boolean form) omits the row index from the file.
data1.to_csv("Training set 95 radiomics features after Pearson correlation pairwise selection.csv", index=False)