# Feature Extraction

In [1]:
import pandas as pd
import numpy as np
import nibabel as nib
from scipy import ndimage as nd
from scipy import stats
import time
from os.path import join

# Let wide dataframes display up to 100 columns before pandas elides them.
pd.options.display.max_columns = 100

## load images into dataframe

In [2]:
# Ground-truth table: headerless CSV whose two columns are named 'file' and 'label'.
df = pd.read_csv(
    join('..', 'csvs', 'GT_Training.csv'),
    header=None,
    names=['file', 'label'],
)

# Drop every single-quote character from the file identifiers
# (presumably they are stored quoted in the CSV -- verify against the file).
df['file'] = df['file'].map(lambda name: name.replace("'", ''))

# Open the NIfTI volume that belongs to each identifier: ../data/<file>.nii.gz
nifti_path = lambda name: join('..', 'data', str(name) + '.nii.gz')
df['img'] = df['file'].map(lambda name: nib.load(nifti_path(name)))

In [3]:
# Keep just the identifier, label and image columns, in this order,
# then preview the first rows.
column_order = ['file', 'label', 'img']
df = df[column_order]
df.head()

Unnamed: 0,file,label,img
0,I004_1,0.0,<class 'nibabel.nifti1.Nifti1Image'>\ndata sha...
1,I004_2,0.0,<class 'nibabel.nifti1.Nifti1Image'>\ndata sha...
2,I004_3,1.0,<class 'nibabel.nifti1.Nifti1Image'>\ndata sha...
3,I005_1,0.0,<class 'nibabel.nifti1.Nifti1Image'>\ndata sha...
4,I005_2,0.0,<class 'nibabel.nifti1.Nifti1Image'>\ndata sha...


## create columns in dataframe for new features

In [58]:
# Names of the feature columns to add to the dataframe:
# ten histogram-bin columns (hist0..hist9) plus the scalar features listed below.
hist_labels = ['hist' + str(x) for x in range(0,10)]
new_features = 'x y z xyz xcom ycom zcom max xmax ymax zmax mean median std variance skew kurtosis entropy'.split() + hist_labels

# Append only the columns that are not present yet, so that re-running this
# cell is a no-op instead of appending the same labels a second time.
# This replaces a bare `except: pass` that silently hid any reindex failure.
missing_features = [name for name in new_features if name not in df.columns]
df = df.reindex(columns=df.columns.tolist() + missing_features)

print(len(new_features),'new features')
print(new_features)

38 new features
['x', 'y', 'z', 'xyz', 'xcom', 'ycom', 'zcom', 'max', 'xmax', 'ymax', 'zmax', 'mean', 'median', 'std', 'variance', 'skew', 'kurtosis', 'entropy', 'hist0', 'hist1', 'hist2', 'hist3', 'hist4', 'hist5', 'hist6', 'hist7', 'hist8', 'hist9', 'hist_norm0', 'hist_norm1', 'hist_norm2', 'hist_norm3', 'hist_norm4', 'hist_norm5', 'hist_norm6', 'hist_norm7', 'hist_norm8', 'hist_norm9']


## calculate features for each example image

In [103]:
# Compute shape, intensity-statistics and histogram features for every image
# in `df.img`, then save the three feature tables as .npy files in the current
# working directory. The features are collected in lists (one row per image,
# in dataframe order); they are not written back into `df` here.
start = time.time()

# Histogram settings for the non-zero intensities: value range and bin count.
HIST_MIN, HIST_MAX, HIST_BINS = 0, 2700, 4

features_shape, features_stats, features_histogram = [], [], []

for i, img in enumerate(df.img):

    # Progress report every 100 images (skipping the very first one).
    if i % 100 == 0 and i != 0: print('i=',i,'\ttime=',time.time()-start)

    data = img.get_fdata()
    # All intensity statistics below are computed over non-zero voxels only.
    # NOTE(review): an image with no non-zero voxel would leave `nonzero` empty
    # and make np.max raise -- confirm the dataset contains no such image.
    nonzero = data[np.nonzero(data)]

    # --- shape features (the unpacking requires 3-D images) ---
    x,y,z = img.shape
    # The original line was truncated to `num_pixels = np.` (a syntax error).
    # Restored as the total voxel count of the volume.
    # NOTE(review): confirm this is the quantity that was originally intended.
    num_pixels = x * y * z
    max_dim = np.max(img.shape)
    mid_dim = np.median(img.shape)
    min_dim = np.min(img.shape)

    # --- intensity histogram ---
    histogram = nd.histogram(nonzero, HIST_MIN, HIST_MAX, HIST_BINS)

    # --- intensity statistics ---
    mean = nd.mean(nonzero)
    median = nd.median(nonzero)
    max_ = np.max(nonzero)
    std = nd.standard_deviation(nonzero)
    var = nd.variance(nonzero)

    skew = stats.skew(nonzero,axis=None)
    kurtosis = stats.kurtosis(nonzero,axis=None)
    # Entropy is computed from the histogram bin counts, not the raw voxels.
    entropy = stats.entropy(histogram)

    features_shape.append([x,y,z,num_pixels,max_dim,mid_dim,min_dim])
    features_stats.append([mean,median,max_,std,var,skew,kurtosis,entropy])
    features_histogram.append(list(histogram))

print('Done with {:d} rows. Total time = {:<10.3g}'.format(len(df),time.time()-start))

# Persist each feature group as its own 2-D array (rows follow dataframe order).
np.save('features_shape.npy',    np.array(features_shape))
np.save('features_stats.npy',    np.array(features_stats))
np.save('features_histogram.npy',np.array(features_histogram))

i= 100 	time= 7.0172364711761475
i= 200 	time= 17.82236647605896
i= 300 	time= 24.16241192817688
i= 400 	time= 29.225863933563232
i= 500 	time= 36.00471043586731
i= 600 	time= 43.61034655570984
i= 700 	time= 51.5311758518219
i= 800 	time= 58.77179002761841
i= 900 	time= 65.00311827659607
i= 1000 	time= 71.42198324203491
i= 1100 	time= 77.42193341255188
i= 1200 	time= 85.82844519615173
i= 1300 	time= 93.41810607910156
i= 1400 	time= 102.58638954162598
Done with 1472 rows. Total time = 111       
