In [6]:
import pandas as pd
import seaborn as sns

# Data analysis

## Data description

Ocular Disease Intelligent Recognition (ODIR) is a structured ophthalmic database of 5,000 patients with age, color fundus photographs from left and right eyes and doctors' diagnostic keywords from doctors.

This dataset is meant to represent ‘‘real-life’’ set of patient information collected by Shanggong Medical Technology Co., Ltd. from different hospitals/medical centers in China. In these institutions, fundus images are captured by various cameras in the market, such as Canon, Zeiss and Kowa, resulting into varied image resolutions.
Annotations were labeled by trained human readers with quality control management. They classify patient into eight labels including:

Normal (N),
Diabetes (D),
Glaucoma (G),
Cataract (C),
Age related Macular Degeneration (A),
Hypertension (H),
Pathological Myopia (M),
Other diseases/abnormalities (O)

## Import Data

In [3]:
df = pd.read_csv("../raw_data/full_df.csv")

In [15]:
df.shape

(6392, 19)

In [31]:
df.tail()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
6387,4686,63,Male,4686_left.jpg,4686_right.jpg,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4686_left.jpg
6388,4688,42,Male,4688_left.jpg,4688_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4688_left.jpg
6389,4689,54,Male,4689_left.jpg,4689_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4689_left.jpg
6390,4690,57,Male,4690_left.jpg,4690_right.jpg,mild nonproliferative retinopathy,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4690_left.jpg
6391,4784,58,Male,4784_left.jpg,4784_right.jpg,hypertensive retinopathy，age-related macular d...,hypertensive retinopathy，age-related macular d...,0,0,0,0,1,1,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['H'],"[0, 0, 0, 0, 0, 1, 0, 0]",4784_left.jpg


In [4]:
df.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


In [5]:
df.columns

Index(['ID', 'Patient Age', 'Patient Sex', 'Left-Fundus', 'Right-Fundus',
       'Left-Diagnostic Keywords', 'Right-Diagnostic Keywords', 'N', 'D', 'G',
       'C', 'A', 'H', 'M', 'O', 'filepath', 'labels', 'target', 'filename'],
      dtype='object')

Do we have more than one file in the column "filename" ?

In [27]:
for i,filename in enumerate(df.filename):
    if len(df.loc[i,['filename']][0]) > 14:
        print(i, filename)

$\rightarrow$ There is not more than 1 filename present in the column. Is this eye responsible of the disease(s) ?

In [89]:
df.isna().sum()

ID                           0
Patient Age                  0
Patient Sex                  0
Left-Fundus                  0
Right-Fundus                 0
Left-Diagnostic Keywords     0
Right-Diagnostic Keywords    0
N                            0
D                            0
G                            0
C                            0
A                            0
H                            0
M                            0
O                            0
filepath                     0
labels                       0
target                       0
filename                     0
dtype: int64

## Frequency of each disease

What is the proportion of each disease ?

In [11]:
df.describe()

Unnamed: 0,ID,Patient Age,N,D,G,C,A,H,M,O
count,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0
mean,2271.150814,57.857947,0.328692,0.332134,0.062109,0.062891,0.049906,0.031758,0.047872,0.248436
std,1417.559018,11.727737,0.469775,0.471016,0.241372,0.242786,0.217768,0.17537,0.213513,0.432139
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,920.75,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2419.5,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3294.0,66.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4784.0,91.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


How many patients have two diseases ?

In [30]:
targets = df[['N','D','G','C','A','H', 'M', 'O']]
perc_two_diseases = len(df[targets.sum(axis=1) > 1])/len(df)
perc_two_diseases

0.15660200250312892

## Separate left and right eyes by keyword

In [57]:
 df['Left-Diagnostic Keywords'].unique()

array(['cataract', 'normal fundus',
       'laser spot，moderate non proliferative retinopathy',
       'macular epiretinal membrane',
       'moderate non proliferative retinopathy', 'drusen',
       'epiretinal membrane',
       'moderate non proliferative retinopathy，hypertensive retinopathy',
       'pathological myopia', 'mild nonproliferative retinopathy',
       'hypertensive retinopathy',
       'macular epiretinal membrane，mild nonproliferative retinopathy',
       'macular hole', 'wet age-related macular degeneration',
       'moderate non proliferative retinopathy，laser spot',
       'moderate non proliferative retinopathy，myelinated nerve fibers',
       'dry age-related macular degeneration',
       'epiretinal membrane，myelinated nerve fibers',
       'diabetic retinopathy', 'epiretinal membrane，lens dust', 'atrophy',
       'laser spot，white vessel，moderate non proliferative retinopathy',
       'chorioretinal atrophy',
       'moderate non proliferative retinopathy，catar

### Normal

In [125]:
boollist_left = []
boollist_right = []
for keyword_left, keyword_right in zip(df['Left-Diagnostic Keywords'],df['Right-Diagnostic Keywords']) :
    boollist_left.append('normal fundus' in keyword_left and not ('epiretinal membrane') in keyword_left)
    boollist_right.append('normal fundus' in keyword_right and not ('epiretinal membrane') in keyword_right)

In [126]:
boollist_tot = [elem1 and elem2 for elem1, elem2 in zip(boollist_left,boollist_right)]

In [131]:
(df[boollist_tot]['N'] == 0).sum()

0

In [133]:
df[[not elem for elem in boollist_tot]][df[[not elem for elem in boollist_tot]]['N'] == 1]

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
2854,4290,51,Male,4290_left.jpg,4290_right.jpg,low image quality,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",4290_right.jpg


In [134]:
boollist_left = []
boollist_right = []
for keyword_left, keyword_right in zip(df['Left-Diagnostic Keywords'],df['Right-Diagnostic Keywords']) :
    boollist_left.append('low image quality' in keyword_left)
    boollist_right.append('low image quality' in keyword_right)

In [137]:
boollist_tot = [elem1 or elem2 for elem1, elem2 in zip(boollist_left,boollist_right)]

In [139]:
len(df[boollist_tot])

18

### Diabetes

In [75]:
boollist_left = []
boollist_right = []
for keyword_left, keyword_right in zip(df['Left-Diagnostic Keywords'],df['Right-Diagnostic Keywords']) :
    boollist_left.append(('proliferative retinopathy' in keyword_left) or ('diabetic' in keyword_left))
    boollist_right.append(('proliferative retinopathy' in keyword_right) or ('diabetic' in keyword_right))

In [78]:
boollist_tot = [elem1 or elem2 for elem1, elem2 in zip(boollist_left,boollist_right)]


In [79]:
(df[boollist_tot]['D'] == 0).sum()

0

In [80]:
(df[[not elem for elem in (boollist_tot)]]['D'] == 1).sum()

0

   ### Glaucoma

In [142]:
(df['G'] == 1).sum()

397

In [143]:
boollist_left = []
boollist_right = []
for keyword_left, keyword_right in zip(df['Left-Diagnostic Keywords'],df['Right-Diagnostic Keywords']) :
    boollist_left.append('glaucoma' in keyword_left)
    boollist_right.append('glaucoma' in keyword_right)

In [144]:
boollist_tot = [elem1 or elem2 for elem1, elem2 in zip(boollist_left,boollist_right)]

In [146]:
(df[boollist_tot]['G'] == 0).sum()

0

In [148]:
len(df[boollist_tot])

397

In [147]:
(df[[not elem for elem in (boollist_tot)]]['G'] == 1).sum()

0

### Cataract

In [149]:
(df['C'] == 1).sum()

402

In [150]:
boollist_left = []
boollist_right = []
for keyword_left, keyword_right in zip(df['Left-Diagnostic Keywords'],df['Right-Diagnostic Keywords']) :
    boollist_left.append('cataract' in keyword_left)
    boollist_right.append('cataract' in keyword_right)

In [151]:
boollist_tot = [elem1 or elem2 for elem1, elem2 in zip(boollist_left,boollist_right)]

In [152]:
(df[boollist_tot]['C'] == 0).sum()

0

In [153]:
len(df[boollist_tot])

402

In [155]:
(df[[not elem for elem in (boollist_tot)]]['C'] == 1).sum()

0

### Other

In [258]:
boollist_left = []
boollist_right = []
for keyword_left, keyword_right in zip(df['Left-Diagnostic Keywords'],df['Right-Diagnostic Keywords']) :
    boollist_left.append('laser' in keyword_left 
                         or 'lens' in keyword_left 
                         or 'macular' in keyword_left 
                         or 'vitreous' in keyword_left 
                         #or 'drusen' in keyword_left 
                         or 'abnormal' in keyword_left
                         or 'epiretinal membrane' in keyword_left
                         or 'myelinated nerve fibers' in keyword_left 
                         or 'refractive' in keyword_left 
                         or 'maculopathy' in keyword_left) 
                         #or 'atroph' in keyword_left 
                         #or 'hemorrhage' in keyword_left 
                         #or 'occlusion' in keyword_left 
                         #or 'pigment' in keyword_left 
                         #or 'spot' in keyword_left)
    boollist_right.append('laser' in keyword_right 
                          or 'lens' in keyword_right 
                          or 'macular' in keyword_right 
                          or 'vitreous' in keyword_right 
                          #or 'drusen' in keyword_right 
                          or 'abnormal' in keyword_right 
                          or 'epiretinal membrane' in keyword_right 
                          or 'myelinated nerve fibers' in keyword_right
                          or 'refractive' in keyword_right
                          or 'maculopathy' in keyword_right)
                          #or 'atroph' in keyword_right   
                          #or 'hemorrhage' in keyword_right
                          #or 'occlusion' in keyword_right 
                          #or 'pigment' in keyword_right 
                          #or 'spot' in keyword_right )
    

In [259]:
boollist_tot = [elem1 or elem2 for elem1, elem2 in zip(boollist_left,boollist_right)]

In [265]:
import numpy as np

In [266]:
df[~np.array(boollist_tot)][df[~np.array(boollist_tot)]['O'] == 1]

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
6,7,60,Female,7_left.jpg,7_right.jpg,drusen,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",7_right.jpg
15,17,57,Male,17_left.jpg,17_right.jpg,drusen,drusen,0,0,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['O'],"[0, 0, 0, 0, 0, 0, 0, 1]",17_right.jpg
28,34,61,Male,34_left.jpg,34_right.jpg,drusen,drusen,0,0,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['O'],"[0, 0, 0, 0, 0, 0, 0, 1]",34_right.jpg
54,66,62,Male,66_left.jpg,66_right.jpg,atrophy,normal fundus,0,0,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",66_right.jpg
61,75,68,Male,75_left.jpg,75_right.jpg,drusen,drusen,0,0,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['O'],"[0, 0, 0, 0, 0, 0, 0, 1]",75_right.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4477,1731,72,Female,1731_left.jpg,1731_right.jpg,pathological myopia,moderate non proliferative retinopathy，tessell...,0,1,0,0,0,0,1,1,../input/ocular-disease-recognition-odir5k/ODI...,['M'],"[0, 0, 0, 0, 0, 0, 1, 0]",1731_left.jpg
4551,2048,63,Male,2048_left.jpg,2048_right.jpg,hypertensive retinopathy,glaucoma，old central retinal vein occlusion,0,0,1,0,0,1,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['H'],"[0, 0, 0, 0, 0, 1, 0, 0]",2048_left.jpg
4608,2145,73,Female,2145_left.jpg,2145_right.jpg,cataract,drusen,0,0,0,1,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['C'],"[0, 0, 0, 1, 0, 0, 0, 0]",2145_left.jpg
4647,2185,72,Female,2185_left.jpg,2185_right.jpg,cataract,drusen,0,0,0,1,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['C'],"[0, 0, 0, 1, 0, 0, 0, 0]",2185_left.jpg


### Labelling

In [289]:
labels_left = [[] for i in df['Left-Diagnostic Keywords']]
for i,keyword in enumerate(df['Left-Diagnostic Keywords']):
    labels_left[i] = ''
    if 'normal fundus' in keyword and not 'epiretinal membrane' in keyword:
        labels_left[i] = labels_left[i] + 'N'
    if 'cataract' in keyword:
        labels_left[i] = labels_left[i] +'C'
    if 'proliferative retinopathy' in keyword:
        labels_left[i] = labels_left[i] + 'D'
    if 'diabetic' in keyword:
        labels_left[i] = labels_left[i] + 'D'
    if 'glaucoma' in keyword:
        labels_left[i] = labels_left[i] + 'G'
    if 'hypertensive' in keyword: #See Joao's notebook
        labels_left[i] = labels_left[i] + 'H'
    if 'myopia' in keyword: #See Joao's notebook
        labels_left[i] = labels_left[i] + 'M'
    if 'myopic' in keyword:
        labels_left[i] = labels_left[i] + 'M'
    if 'age' in keyword:
        labels_left[i] = labels_left[i] + 'A'
    if labels_left[i] == '':
        labels_left[i] = labels_left[i] + 'O'
        

In [290]:
labels_right = [[] for i in df['Right-Diagnostic Keywords']]
for i,keyword in enumerate(df['Right-Diagnostic Keywords']):
    labels_right[i] = ''
    if 'normal fundus' in keyword and not 'epiretinal membrane' in keyword:
        labels_right[i] = labels_right[i] + 'N'
    if 'cataract' in keyword:
        labels_right[i] = labels_right[i] +'C'
    if 'proliferative retinopathy' in keyword:
        labels_right[i] = labels_right[i] + 'D'
    if 'diabetic' in keyword:
        labels_right[i] = labels_right[i] + 'D'
    if 'glaucoma' in keyword:
        labels_right[i] = labels_right[i] + 'G'
    if 'hypertensive' in keyword: #See Joao's notebook
        labels_right[i] = labels_right[i] + 'H'
    if 'myopia' in keyword: #See Joao's notebook
        labels_right[i] = labels_right[i] + 'M'
    if 'myopic' in keyword:
        labels_right[i] = labels_right[i] + 'M'
    if 'age' in keyword:
        labels_right[i] = labels_right[i] + 'A'
    if labels_right[i] == '':
        labels_right[i] = labels_right[i] + 'O'

In [291]:
df['labels_right'] = labels_right
df['labels_left'] = labels_left

In [297]:
(np.array([len(label) for label in df.labels_left]) > 1).sum()

176