In [1]:
%pylab inline
import cv2
import numpy as np
import dicom
import pandas as pd



Populating the interactive namespace from numpy and matplotlib


## Metadata handling

In [2]:
# Load the image crosswalk (tab-separated): one row per image file, carrying
# its subject, exam, view, laterality and cancer label.
tbl_image = pd.read_csv('./metadata/images_crosswalk.tsv', sep="\t")
tbl_image.head()

Unnamed: 0,subjectId,examIndex,imageIndex,view,laterality,filename,cancer
0,20,1,1,CC,R,000135.dcm,0
1,20,1,2,CC,L,000136.dcm,0
2,20,1,3,MLO,L,000137.dcm,0
3,20,1,4,MLO,R,000138.dcm,0
4,98,1,1,CC,R,100151.dcm,0


In [3]:
# Re-index the crosswalk by view so that all images of one view can be pulled
# out with .loc, then list the distinct view codes that occur.
tbl_image_view_indexed = tbl_image.set_index('view')
tbl_image_view_indexed.index.unique()

array(['CC', 'MLO', 'CV', 'CCID', 'MLOID', 'XCCL'], dtype=object)

In [4]:
# Inspect the images recorded with the less common 'CV' view code.
tbl_image_view_indexed.loc['CV']

Unnamed: 0_level_0,subjectId,examIndex,imageIndex,laterality,filename,cancer
view,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CV,836,1,5,R,105549.dcm,0
CV,55254,3,5,L,578699.dcm,0
CV,56779,2,5,R,587072.dcm,0


In [5]:
# Inspect the images recorded with the 'CCID' view code.
tbl_image_view_indexed.loc['CCID']

Unnamed: 0_level_0,subjectId,examIndex,imageIndex,laterality,filename,cancer
view,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CCID,21107,1,5,R,507746.dcm,0
CCID,21107,1,6,L,507754.dcm,0
CCID,21107,2,5,R,267631.dcm,1
CCID,21107,2,6,L,267634.dcm,0
CCID,21107,2,9,R,267643.dcm,1
CCID,21107,2,10,L,267646.dcm,0


In [6]:
# Load the exam-level metadata (tab-separated): one row per (subject, exam).
# NOTE: the file uses '.' as a placeholder in many columns (see the preview),
# so such columns are not read as plain numbers.
tbl_exam = pd.read_csv('./metadata/exams_metadata.tsv', sep="\t")
tbl_exam.head()

Unnamed: 0,subjectId,examIndex,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,...,yearsSincePreviousBc,previousBcLaterality,reduxHistory,reduxLaterality,hrt,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race
0,20,1,0,0,0,0,0,66,.,.,...,.,.,.,.,0,0,0,0,.,1
1,98,1,0,1,0,0,0,40,0,.,...,.,.,0,.,0,0,0,0,31.4735,7
2,836,1,0,0,0,0,0,61,.,.,...,.,.,.,.,9,9,9,9,.,1
3,836,2,371,0,0,0,0,62,0,.,...,.,.,0,.,0,0,1,0,21.8095,1
4,1626,1,0,1,0,1,0,71,0,.,...,.,.,0,.,0,0,1,0,27.3686,1


In [7]:
# The preview above elides the middle columns; list every column name.
tbl_exam.columns

Index([u'subjectId', u'examIndex', u'daysSincePreviousExam', u'cancerL',
       u'cancerR', u'invL', u'invR', u'age', u'implantEver', u'implantNow',
       u'bcHistory', u'yearsSincePreviousBc', u'previousBcLaterality',
       u'reduxHistory', u'reduxLaterality', u'hrt', u'antiestrogen',
       u'firstDegreeWithBc', u'firstDegreeWithBc50', u'bmi', u'race'],
      dtype='object')

In [8]:
# Key the exam table by (subjectId, examIndex) so it can later be joined with
# the image table on the same pair.
tbl_exam_indexed = tbl_exam.set_index(['subjectId', 'examIndex'])
tbl_exam_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,previousBcLaterality,reduxHistory,reduxLaterality,hrt,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race
subjectId,examIndex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
20,1,0,0,0,0,0,66,.,.,0,.,.,.,.,0,0,0,0,.,1
98,1,0,1,0,0,0,40,0,.,0,.,.,0,.,0,0,0,0,31.4735,7
836,1,0,0,0,0,0,61,.,.,0,.,.,.,.,9,9,9,9,.,1
836,2,371,0,0,0,0,62,0,.,0,.,.,0,.,0,0,1,0,21.8095,1
1626,1,0,1,0,1,0,71,0,.,0,.,.,0,.,0,0,1,0,27.3686,1
1626,2,1001,0,0,0,0,74,.,.,1,2.5,2,0,.,0,1,1,0,22.8072,1
2642,1,0,0,0,0,0,61,0,.,0,.,.,0,.,0,0,0,0,47.9252,1
2642,2,826,0,0,0,0,63,0,.,0,.,.,0,.,0,0,0,0,35.9439,1
3642,1,0,1,0,0,0,67,.,.,0,.,.,.,.,0,0,0,0,42.9330,1
5424,1,0,1,0,1,0,40,0,.,0,.,.,0,.,0,0,9,9,24.2168,1


In [9]:
# Look at the raw MultiIndex: the distinct values of each level plus the
# per-row integer positions into those levels.
tbl_exam_indexed.index

MultiIndex(levels=[[20, 98, 836, 1626, 2642, 3642, 5424, 7856, 9156, 10090, 10916, 12562, 16585, 17459, 20827, 20934, 21107, 21896, 26102, 26434, 29374, 29504, 29579, 30996, 31175, 33162, 40034, 42366, 43184, 46211, 46273, 46484, 47041, 50570, 50985, 51286, 51323, 53538, 53559, 55254, 55578, 56570, 56779, 56960, 57279, 57571, 58554, 58950, 59265, 64073, 64872, 65228, 65313, 65725, 66628, 67906, 72548, 76540], [1, 2, 3, 4, 5]],
           labels=[[0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 8, 8, 9, 10, 11, 11, 12, 12, 12, 12, 13, 14, 15, 15, 15, 16, 16, 17, 18, 18, 18, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 23, 24, 25, 26, 26, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 57], [0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 2, 0, 1, 0, 0, 0, 1, 0, 1, 2, 3, 0, 0, 0, 1, 2, 0, 1, 0, 0, 1, 2, 3, 4, 0, 1, 0, 1,

In [10]:
print 'Number of patients:', len(tbl_exam_indexed.index.levels[0])
print 'Max number of exams:', len(tbl_exam_indexed.index.levels[1])
print 'Number of patient X exam:', len(tbl_exam_indexed.index.labels[0])

Number of patients: 58
Max number of exams: 5
Number of patient X exam: 111


In [11]:
# Key the image table by the same (subjectId, examIndex) pair as the exam
# table; several image rows share one key.
tbl_image_indexed = tbl_image.set_index(['subjectId', 'examIndex'])
tbl_image_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,imageIndex,view,laterality,filename,cancer
subjectId,examIndex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20,1,1,CC,R,000135.dcm,0
20,1,2,CC,L,000136.dcm,0
20,1,3,MLO,L,000137.dcm,0
20,1,4,MLO,R,000138.dcm,0
98,1,1,CC,R,100151.dcm,0
98,1,2,CC,L,100152.dcm,1
98,1,3,MLO,L,100153.dcm,1
98,1,4,MLO,R,100154.dcm,0
836,1,1,CC,R,105545.dcm,0
836,1,2,CC,L,105546.dcm,0


In [12]:
# Attach the exam-level metadata to every image row by joining on the shared
# (subjectId, examIndex) index. DataFrame.join defaults to a left join, so the
# exam table drives the result and its columns are repeated for each image.
tbl_exam_img_indexed = tbl_exam_indexed.join(tbl_image_indexed)
tbl_exam_img_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,imageIndex,view,laterality,filename,cancer
subjectId,examIndex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20,1,0,0,0,0,0,66,.,.,0,.,...,0,0,0,.,1,1,CC,R,000135.dcm,0
20,1,0,0,0,0,0,66,.,.,0,.,...,0,0,0,.,1,2,CC,L,000136.dcm,0
20,1,0,0,0,0,0,66,.,.,0,.,...,0,0,0,.,1,3,MLO,L,000137.dcm,0
20,1,0,0,0,0,0,66,.,.,0,.,...,0,0,0,.,1,4,MLO,R,000138.dcm,0
98,1,0,1,0,0,0,40,0,.,0,.,...,0,0,0,31.4735,7,1,CC,R,100151.dcm,0
98,1,0,1,0,0,0,40,0,.,0,.,...,0,0,0,31.4735,7,2,CC,L,100152.dcm,1
98,1,0,1,0,0,0,40,0,.,0,.,...,0,0,0,31.4735,7,3,MLO,L,100153.dcm,1
98,1,0,1,0,0,0,40,0,.,0,.,...,0,0,0,31.4735,7,4,MLO,R,100154.dcm,0
836,1,0,0,0,0,0,61,.,.,0,.,...,9,9,9,.,1,1,CC,R,105545.dcm,0
836,1,0,0,0,0,0,61,.,.,0,.,...,9,9,9,.,1,2,CC,L,105546.dcm,0


In [64]:
# Inspect subject 64073, whose cancerL and invL values are the '.' placeholder
# and who has right-side images only.
tbl_exam_img_indexed.loc[(64073,)]

Unnamed: 0_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,imageIndex,view,laterality,filename,cancer
examIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,.,0,.,0,63,.,.,1,10.5,...,0,0,0,22.7099,1,1,CC,R,608401.dcm,0
1,0,.,0,.,0,63,.,.,1,10.5,...,0,0,0,22.7099,1,2,MLO,R,608406.dcm,0


In [13]:
print 'cancerL levels:', unique(tbl_exam_img_indexed.cancerL)
print 'cancerR levels:', unique(tbl_exam_img_indexed.cancerR)

cancerL levels: ['.' '0' '1']
cancerR levels: [0 1]


In [14]:
print 'Number of masked cancerL:', sum(tbl_exam_img_indexed.cancerL == '.')

Number of masked cancerL: 2


In [15]:
tmp = tbl_exam_img_indexed.head(20)
for idx, dat in tmp.iterrows():
    img_name = dat['filename']
    view = dat['view']
    laterality = dat['laterality']
    cancer = dat['cancerL'] if laterality == 'L' else dat['cancerR']
    print 'filename:', img_name, 'view:', view, 'cancer:', cancer

filename: 000135.dcm view: CC cancer: 0
filename: 000136.dcm view: CC cancer: 0
filename: 000137.dcm view: MLO cancer: 0
filename: 000138.dcm view: MLO cancer: 0
filename: 100151.dcm view: CC cancer: 0
filename: 100152.dcm view: CC cancer: 1
filename: 100153.dcm view: MLO cancer: 1
filename: 100154.dcm view: MLO cancer: 0
filename: 105545.dcm view: CC cancer: 0
filename: 105546.dcm view: CC cancer: 0
filename: 105547.dcm view: MLO cancer: 0
filename: 105548.dcm view: MLO cancer: 0
filename: 105549.dcm view: CV cancer: 0
filename: 105550.dcm view: CC cancer: 0
filename: 105551.dcm view: CC cancer: 0
filename: 105552.dcm view: MLO cancer: 0
filename: 105553.dcm view: MLO cancer: 0
filename: 111358.dcm view: CC cancer: 0
filename: 111359.dcm view: CC cancer: 1
filename: 111360.dcm view: MLO cancer: 1


- `read_flatten_images` - for pretraining.
- `read_flatten_images_with_view` - for SC1.
- `read_last_images_with_meta` - for SC2.

In [16]:
# Back-of-the-envelope size estimate; presumably 100000 items x 20 values x
# 8 bytes expressed in MiB -- TODO confirm what the factors stand for.
# All operands are integers, so Python 2 truncates the quotient (15.26 -> 15).
100000 * 20 * 8 / 1024**2

15

In [17]:
# Check how os.path treats a double extension: basename() leaves the name
# untouched and splitext() strips only the final '.gz' suffix.
from os.path import basename, splitext
filename = '105546.dcm.gz'
print basename(filename)
print splitext(filename)

105546.dcm.gz
('105546.dcm', '.gz')


In [18]:
# Attribute-style column access yields a pandas Series.
type(tbl_exam_img_indexed.filename)

pandas.core.series.Series

In [19]:
# Item-style column access yields the same type as attribute access.
type(tbl_exam_img_indexed['filename'])

pandas.core.series.Series

In [20]:
# Swap each file's last extension for '.png' (e.g. 000135.dcm -> 000135.png).
tbl_exam_img_indexed['filename'].map(lambda fname: splitext(fname)[0] + '.png')

subjectId  examIndex
20         1            000135.png
           1            000136.png
           1            000137.png
           1            000138.png
98         1            100151.png
           1            100152.png
           1            100153.png
           1            100154.png
836        1            105545.png
           1            105546.png
           1            105547.png
           1            105548.png
           1            105549.png
           2            105550.png
           2            105551.png
           2            105552.png
           2            105553.png
1626       1            111358.png
           1            111359.png
           1            111360.png
           1            111361.png
           2            111362.png
           2            111363.png
           2            111364.png
           2            111365.png
2642       1            121370.png
           1            121373.png
           1            121377.png

In [21]:
# Build the project's metadata manager from the same two TSV files explored
# above, pointing it at the folder of converted images and their extension.
# NOTE(review): DMMetaManager lives in the local `meta` module, which is not
# shown here; its behaviour is inferred only from the outputs below.
from meta import DMMetaManager
dm_meta_man = DMMetaManager(exam_tsv='metadata/exams_metadata.tsv', 
                            img_tsv='metadata/images_crosswalk.tsv', 
                            img_folder='preprocessedData/jpg_prep', 
                            img_extension='png')

In [22]:
train_list = dm_meta_man.get_flatten_img_list()
print type(train_list)
print len(train_list)
print sum(array(train_list[1]) == 1)
print sum(array(train_list[1]) == 0)

<type 'tuple'>
2
34
466


In [23]:
# Spot-check one entry: the image path and the label at the same position.
print train_list[0][99]
print train_list[1][99]

preprocessedData/jpg_prep/148435.png
0


In [24]:
# Positions of the positive (label == 1) images in the flattened list.
# Explicit np. prefixes replace the bare `where`/`array` names that are only
# available because of the %pylab star import.
np.where(np.array(train_list[1]) == 1)[0]

array([  5,   6,  18,  19,  38,  39,  42,  43,  71,  74, 128, 131, 132,
       135, 136, 160, 162, 173, 174, 186, 187, 224, 225, 313, 316, 317,
       318, 321, 324, 325, 337, 338, 483, 484])

In [25]:
# Path of the first positive image (position 5 in the index array above).
train_list[0][5]

'preprocessedData/jpg_prep/100152.png'

In [26]:
# Show the distinct values of both index levels of the joined table:
# subject ids first, then exam indices.
print tbl_exam_img_indexed.index.levels[0]
print '='*10
print tbl_exam_img_indexed.index.levels[1]

Int64Index([   20,    98,   836,  1626,  2642,  3642,  5424,  7856,  9156,
            10090, 10916, 12562, 16585, 17459, 20827, 20934, 21107, 21896,
            26102, 26434, 29374, 29504, 29579, 30996, 31175, 33162, 40034,
            42366, 43184, 46211, 46273, 46484, 47041, 50570, 50985, 51286,
            51323, 53538, 53559, 55254, 55578, 56570, 56779, 56960, 57279,
            57571, 58554, 58950, 59265, 64073, 64872, 65228, 65313, 65725,
            66628, 67906, 72548, 76540],
           dtype='int64', name=u'subjectId')
Int64Index([1, 2, 3, 4, 5], dtype='int64', name=u'examIndex')


In [27]:
pat = tbl_exam_img_indexed.loc[[66628, None]]
exams = pat.index.labels[1]
u_exams = np.unique(exams)
print exams
print u_exams

FrozenNDArray([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype='int8')
FrozenNDArray([0, 1, 2], dtype='int8')


In [28]:
# Display the rows selected for this subject.
pat

Unnamed: 0_level_0,Unnamed: 1_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,imageIndex,view,laterality,filename,cancer
subjectId,examIndex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
66628,1,0,0,0,0,0,52,0,.,0,.,...,0,0,0,34.9984,1,1,CC,R,666765.dcm,0
66628,1,0,0,0,0,0,52,0,.,0,.,...,0,0,0,34.9984,1,2,CC,L,666768.dcm,0
66628,1,0,0,0,0,0,52,0,.,0,.,...,0,0,0,34.9984,1,3,MLO,L,666771.dcm,0
66628,1,0,0,0,0,0,52,0,.,0,.,...,0,0,0,34.9984,1,4,MLO,R,666774.dcm,0
66628,2,737,0,0,0,0,54,.,.,0,.,...,0,0,0,33.2263,1,1,CC,R,666780.dcm,0
66628,2,737,0,0,0,0,54,.,.,0,.,...,0,0,0,33.2263,1,2,CC,L,666784.dcm,0
66628,2,737,0,0,0,0,54,.,.,0,.,...,0,0,0,33.2263,1,3,MLO,R,666787.dcm,0
66628,2,737,0,0,0,0,54,.,.,0,.,...,0,0,0,33.2263,1,4,MLO,L,666790.dcm,0
66628,3,983,0,0,0,0,57,.,.,0,.,...,0,0,0,27.7624,1,1,CC,R,666828.dcm,0
66628,3,983,0,0,0,0,57,.,.,0,.,...,0,0,0,27.7624,1,2,CC,L,666832.dcm,0


In [29]:
n_exam = len(np.unique(pat.index.labels[1]))
print n_exam

3


In [30]:
subj_list = tbl_exam_img_indexed.index.levels[0]
for subj_id in subj_list:
    subj = tbl_exam_img_indexed.loc[[subj_id, None]]
    n_exam = len(np.unique(subj.index.labels[1]))
    print 'ID: %s, exams: %d' % (str(subj_id), n_exam)

ID: 20, exams: 1
ID: 98, exams: 1
ID: 836, exams: 2
ID: 1626, exams: 2
ID: 2642, exams: 2
ID: 3642, exams: 1
ID: 5424, exams: 1
ID: 7856, exams: 3
ID: 9156, exams: 2
ID: 10090, exams: 1
ID: 10916, exams: 1
ID: 12562, exams: 2
ID: 16585, exams: 4
ID: 17459, exams: 1
ID: 20827, exams: 1
ID: 20934, exams: 3
ID: 21107, exams: 2
ID: 21896, exams: 1
ID: 26102, exams: 5
ID: 26434, exams: 2
ID: 29374, exams: 2
ID: 29504, exams: 2
ID: 29579, exams: 2
ID: 30996, exams: 3
ID: 31175, exams: 1
ID: 33162, exams: 1
ID: 40034, exams: 2
ID: 42366, exams: 1
ID: 43184, exams: 3
ID: 46211, exams: 2
ID: 46273, exams: 2
ID: 46484, exams: 3
ID: 47041, exams: 1
ID: 50570, exams: 3
ID: 50985, exams: 1
ID: 51286, exams: 1
ID: 51323, exams: 4
ID: 53538, exams: 1
ID: 53559, exams: 2
ID: 55254, exams: 3
ID: 55578, exams: 1
ID: 56570, exams: 1
ID: 56779, exams: 3
ID: 56960, exams: 1
ID: 57279, exams: 1
ID: 57571, exams: 3
ID: 58554, exams: 1
ID: 58950, exams: 4
ID: 59265, exams: 2
ID: 64073, exams: 1
ID: 64872, exa

In [31]:
index = tbl_exam_img_indexed.loc[26102].index
print index
print np.unique(index)
pat = tbl_exam_img_indexed.loc[26102]
pat.loc[5].iloc[:, :3]

Int64Index([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5], dtype='int64', name=u'examIndex')
[1 2 3 4 5]


Unnamed: 0_level_0,daysSincePreviousExam,cancerL,cancerR
examIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,370,1,0
5,370,1,0
5,370,1,0
5,370,1,0


In [32]:
# Compare the de-duplicated exam indices of subject 26102 with the raw,
# repeated index (one entry per image row).
pat = tbl_exam_img_indexed.loc[26102]
print pat.index.unique()
print '='*20
print pat.index

[1 2 3 4 5]
Int64Index([1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5], dtype='int64', name=u'examIndex')


In [33]:
def flatten_exam_generator(exam_img_tbl):
    """Yield the image rows of every exam, one exam at a time.

    Args:
        exam_img_tbl: DataFrame whose index is a two-level MultiIndex with the
            subject id as the first level and the exam index as the second.

    Yields:
        A DataFrame holding the rows of one (subject, exam) pair, indexed by
        the exam index. Subjects are visited in order of first appearance and,
        within a subject, exams in order of first appearance.
    """
    # Use the subject ids actually present in the rows. `index.levels[0]`
    # keeps ids that were filtered out of the table, and looking those up
    # raises a KeyError.
    subj_list = exam_img_tbl.index.get_level_values(0).unique()
    for subj_id in subj_list:
        subj = exam_img_tbl.loc[subj_id]
        for ex_idx in subj.index.unique():
            # A list key always returns a DataFrame; a scalar key would
            # collapse an exam with a single image into a Series.
            yield subj.loc[[ex_idx]]

In [34]:
tmp = flatten_exam_generator(tbl_exam_img_indexed)
print next(tmp).iloc[:, :3]
print '='*20
print next(tmp).iloc[:, :3]
print '='*20
print next(tmp).iloc[:, :3]
print '='*20
print next(tmp).iloc[:, :3]

           daysSincePreviousExam cancerL  cancerR
examIndex                                        
1                              0       0        0
1                              0       0        0
1                              0       0        0
1                              0       0        0
           daysSincePreviousExam cancerL  cancerR
examIndex                                        
1                              0       1        0
1                              0       1        0
1                              0       1        0
1                              0       1        0
           daysSincePreviousExam cancerL  cancerR
examIndex                                        
1                              0       0        0
1                              0       0        0
1                              0       0        0
1                              0       0        0
1                              0       0        0
           daysSincePreviousExam cancerL  cancerR


In [35]:
# Take the next exam from the generator. The builtin next() is used, matching
# the preceding cell; the generator's .next() method exists on Python 2 only.
pat = next(tmp)
pat

Unnamed: 0_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,imageIndex,view,laterality,filename,cancer
examIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,1,0,71,0,.,0,.,...,0,1,0,27.3686,1,1,CC,R,111358.dcm,0
1,0,1,0,1,0,71,0,.,0,.,...,0,1,0,27.3686,1,2,CC,L,111359.dcm,1
1,0,1,0,1,0,71,0,.,0,.,...,0,1,0,27.3686,1,3,MLO,L,111360.dcm,1
1,0,1,0,1,0,71,0,.,0,.,...,0,1,0,27.3686,1,4,MLO,R,111361.dcm,0


In [36]:
# Narrow the exam down to the columns that identify each image.
pat[['imageIndex', 'view', 'laterality', 'filename']]

Unnamed: 0_level_0,imageIndex,view,laterality,filename
examIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,CC,R,111358.dcm
1,2,CC,L,111359.dcm
1,3,MLO,L,111360.dcm
1,4,MLO,R,111361.dcm


In [37]:
# Re-key the exam's images by (laterality, view, imageIndex) so images can be
# looked up by breast side and view.
pat_indexed = pat.set_index(['laterality', 'view', 'imageIndex'])
pat_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,reduxHistory,reduxLaterality,hrt,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,filename,cancer
laterality,view,imageIndex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
R,CC,1,0,1,0,1,0,71,0,.,0,.,...,0,.,0,0,1,0,27.3686,1,111358.dcm,0
L,CC,2,0,1,0,1,0,71,0,.,0,.,...,0,.,0,0,1,0,27.3686,1,111359.dcm,1
L,MLO,3,0,1,0,1,0,71,0,.,0,.,...,0,.,0,0,1,0,27.3686,1,111360.dcm,1
R,MLO,4,0,1,0,1,0,71,0,.,0,.,...,0,.,0,0,1,0,27.3686,1,111361.dcm,0


In [38]:
# Left-breast label taken from the first row. .iloc makes the positional
# lookup explicit; plain [0] on a label-indexed Series only works through a
# positional fallback that newer pandas releases deprecate.
a = pat_indexed['cancerL'].iloc[0]
a

'1'

In [39]:
# cancerL holds strings here (the previous cell returned '1'), so max()
# compares lexicographically rather than numerically.
pat_indexed['cancerL'].max()

'1'

In [40]:
# Left-breast MLO images of this exam: select the laterality level first, then
# the view level. .loc replaces the .ix indexer, which was deprecated and
# later removed from pandas; both keys are labels, so the result is the same.
pat_indexed.loc['L'].loc['MLO']

Unnamed: 0_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,reduxHistory,reduxLaterality,hrt,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,filename,cancer
imageIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0,1,0,1,0,71,0,.,0,.,...,0,.,0,0,1,0,27.3686,1,111360.dcm,1


In [41]:
images = pat_indexed.ix['R'].ix['CC']
print images.index
print '='*10
print max(images.index)
print '='*10
images

Int64Index([1], dtype='int64', name=u'imageIndex')
1


Unnamed: 0_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,reduxHistory,reduxLaterality,hrt,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,filename,cancer
imageIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,1,0,71,0,.,0,.,...,0,.,0,0,1,0,27.3686,1,111358.dcm,0


In [43]:
# File name(s) of the selected images, keyed by imageIndex.
images['filename']

imageIndex
1    111358.dcm
Name: filename, dtype: object

In [44]:
# Look at one full exam (subject 836, exam 1); it contains a fifth image with
# the 'CV' view in addition to the usual four.
tbl_exam_img_indexed.loc[(836, 1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,daysSincePreviousExam,cancerL,cancerR,invL,invR,age,implantEver,implantNow,bcHistory,yearsSincePreviousBc,...,antiestrogen,firstDegreeWithBc,firstDegreeWithBc50,bmi,race,imageIndex,view,laterality,filename,cancer
subjectId,examIndex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
836,1,0,0,0,0,0,61,.,.,0,.,...,9,9,9,.,1,1,CC,R,105545.dcm,0
836,1,0,0,0,0,0,61,.,.,0,.,...,9,9,9,.,1,2,CC,L,105546.dcm,0
836,1,0,0,0,0,0,61,.,.,0,.,...,9,9,9,.,1,3,MLO,L,105547.dcm,0
836,1,0,0,0,0,0,61,.,.,0,.,...,9,9,9,.,1,4,MLO,R,105548.dcm,0
836,1,0,0,0,0,0,61,.,.,0,.,...,9,9,9,.,1,5,CV,R,105549.dcm,0


In [45]:
# Re-create the metadata manager with the same arguments as the earlier cell.
# NOTE(review): this repeats that cell verbatim, import included.
from meta import DMMetaManager
dm_meta_man = DMMetaManager(exam_tsv='metadata/exams_metadata.tsv', 
                            img_tsv='metadata/images_crosswalk.tsv', 
                            img_folder='preprocessedData/jpg_prep', 
                            img_extension='png')

In [46]:
# Fetch one record per exam. Judging by the outputs further down, each record
# is a (subjectId, examIndex, info) tuple where info maps 'L'/'R' to a dict
# with 'CC' and 'MLO' image tables plus a 'cancer' label -- confirm against
# the DMMetaManager implementation.
exam_list = dm_meta_man.get_flatten_exam_list()

In [47]:
# Left-breast CC image table of the first exam record.
exam_list[0][2]['L']['CC']

Unnamed: 0_level_0,filename
imageIndex,Unnamed: 1_level_1
2,preprocessedData/jpg_prep/000136.png


In [48]:
# Pick the file of the right-breast MLO image with the highest imageIndex in
# the first exam record.
images = exam_list[0][2]['R']['MLO']
idx = images.index.max()
images.loc[idx, 'filename']

'preprocessedData/jpg_prep/000138.png'

In [49]:
def get_n_img(ex, lat, view):
    """Return how many images an exam record holds for one side and view.

    Args:
        ex: exam record; its third element is indexed as ex[2][lat][view].
        lat: laterality key used for the first lookup (e.g. 'R').
        view: view key used for the second lookup (e.g. 'MLO').

    Returns:
        0 when the entry is None, otherwise the entry's row count (shape[0]).
    """
    view_imgs = ex[2][lat][view]
    return 0 if view_imgs is None else view_imgs.shape[0]
# Number of right-breast MLO images in every exam record, as a 1-D array.
# A list comprehension replaces map(): on Python 3 map() returns a lazy
# iterator, which np.array() would wrap as a 0-d object array.
n_img_R_MLO = np.array([get_n_img(exam_rec, 'R', 'MLO') for exam_rec in exam_list])
n_img_R_MLO

array([1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 1, 2, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [50]:
# Largest right-MLO image count in any exam, and the positions in exam_list
# of the exams that reach it.
print n_img_R_MLO.max()
print '='*20
print np.where(n_img_R_MLO == n_img_R_MLO.max())

3
(array([ 6, 77]),)


In [51]:
# Inspect one of the exams with three right-breast MLO images.
exam_list[6][2]['R']['MLO']

Unnamed: 0_level_0,filename
imageIndex,Unnamed: 1_level_1
4,preprocessedData/jpg_prep/121381.png
7,preprocessedData/jpg_prep/121390.png
8,preprocessedData/jpg_prep/121393.png


In [52]:
# Show the complete record of that exam: subject id, exam index and the
# per-side dictionary of image tables and cancer labels.
e = exam_list[6]
e

(2642, 1, {'L': {'CC':                                         filename
   imageIndex                                      
   2           preprocessedData/jpg_prep/121373.png
   6           preprocessedData/jpg_prep/121388.png,
   'MLO':                                         filename
   imageIndex                                      
   3           preprocessedData/jpg_prep/121377.png,
   'cancer': '0'},
  'R': {'CC':                                         filename
   imageIndex                                      
   1           preprocessedData/jpg_prep/121370.png
   5           preprocessedData/jpg_prep/121385.png,
   'MLO':                                         filename
   imageIndex                                      
   4           preprocessedData/jpg_prep/121381.png
   7           preprocessedData/jpg_prep/121390.png
   8           preprocessedData/jpg_prep/121393.png,
   'cancer': '0'}})

In [53]:
# Fetch the last-exam records; the counts printed below show one record per
# patient (58) versus one per exam (111) in exam_list.
last_exam_list = dm_meta_man.get_last_exam_list()

In [54]:
# Compare the sizes of the two lists: all exams versus last exams only.
print len(exam_list)
print '='*20
print len(last_exam_list)

111
58


In [55]:
# Peek at the first five last-exam records.
last_exam_list[:5]

[(20, 1, {'L': {'CC':                                         filename
    imageIndex                                      
    2           preprocessedData/jpg_prep/000136.png,
    'MLO':                                         filename
    imageIndex                                      
    3           preprocessedData/jpg_prep/000137.png,
    'cancer': '0'},
   'R': {'CC':                                         filename
    imageIndex                                      
    1           preprocessedData/jpg_prep/000135.png,
    'MLO':                                         filename
    imageIndex                                      
    4           preprocessedData/jpg_prep/000138.png,
    'cancer': '0'}}),
 (98, 1, {'L': {'CC':                                         filename
    imageIndex                                      
    2           preprocessedData/jpg_prep/100152.png,
    'MLO':                                         filename
    imageIndex                       

In [62]:
# Demonstrate unzipping a list of (item, label) pairs into two parallel tuples
# and turning the label tuple into a NumPy array.
# print is written in single-argument call form: it prints exactly the same
# text on Python 2 and is also valid on Python 3, unlike the print statement.
list_of_tup = [('a', 0), ('b', 0), ('c', 1)]
print(list_of_tup)
tup_of_tup = tuple(zip(*list_of_tup))
print(tup_of_tup)
lab = np.array(tup_of_tup[1])
print(type(lab))
print(lab)

[('a', 0), ('b', 0), ('c', 1)]
(('a', 'b', 'c'), (0, 0, 1))
<type 'numpy.ndarray'>
[0 0 1]
