In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
!spip3 install cloudstorage

Defaulting to user installation because normal site-packages is not writeable


In [11]:
import cloudstorage as gcs

ModuleNotFoundError: No module named 'cloudstorage'

## 1 - debugging of splitting

So we encountered a strange bug. When we split our dataframe on different VMs we get different results. How is it possible?

In [4]:
df = pd.read_csv('df.csv')
df2 = pd.read_csv('df2.csv')

In [6]:
df.shape, df2.shape

((112120, 7), (112120, 7))

In [8]:
df.columns

Index(['Unnamed: 0', 'file_path', 'patient_id', 'labels', 'labels_array',
       'label_pneumonia', 'split'],
      dtype='object')

So split is different. Let's try to look at patients.

In [9]:
df.split.value_counts()

train    89600
val      11379
test     11141
Name: split, dtype: int64

In [10]:
df2.split.value_counts()

train    90451
test     11069
val      10600
Name: split, dtype: int64

So patients are the same but in different order.

In [17]:
np.all(df.patient_id.unique() == df2.patient_id.unique())

False

In [16]:
np.all(sorted(list(df.patient_id.unique())) == sorted(list(df2.patient_id.unique())))

True

In [13]:
len(df.patient_id.unique()), len(df2.patient_id.unique())

(30805, 30805)

But what about patients in train split? They are completely different!

In [20]:
df_train = df[df.split == 'train']
df2_train = df2[df2.split == 'train']
df_train.shape, df2_train.shape

((89600, 7), (90451, 7))

In [25]:
np.all(sorted(list(df_train.patient_id.unique())) == sorted(list(df2_train.patient_id.unique())))

False

In [27]:
patients_train = set(df_train.patient_id.unique())
patients2_train = set(df2_train.patient_id.unique())
len(patients_train), len(patients2_train)

(24644, 24644)

In [28]:
np.all(sorted(list(patients_train)) == sorted(list(patients2_train)))

False

In [30]:
len(patients_train.difference(patients2_train))

4914

In [31]:
sorted(list(patients_train))[:10]

[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [32]:
sorted(list(patients2_train))[:10]

[1, 2, 3, 5, 6, 7, 8, 10, 13, 14]

So bug is in **splitting patients**! Despite we set `random state`!

## 2 - prepare split of files to use

So we need to prepare one split to use.

### 1 - split patients

In [34]:
df = pd.read_csv('Data_Entry_2017.csv')

In [35]:
df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [36]:
patients = df['Patient ID'].unique()

In [37]:
len(patients)

30805

In [40]:
class Params:

    def __init__(self):
        # hyper-parameters
        self.LEARNING_RATE = 0.001
        self.BATCH_SIZE = 32
        self.EPOCHS = 5

        # parameters for dataset
        self.CLASSES = [
            'Atelectasis',
            'Cardiomegaly',
            'Consolidation',
            'Edema',
            'Effusion',
            'Emphysema',
            'Fibrosis',
            'Hernia',
            'Infiltration',
            'Mass',
            # original label 'No Finding'
            # we don't include it in our labels
            # 'No_Finding',
            'Nodule',
            'Pleural_Thickening',
            'Pneumonia',
            'Pneumothorax'
        ]
        self.NO_FINDING = 'No_Finding'
        self.PNEUMONIA = 'Pneumonia'
        self.NOT_PNEUMONIA = 'not ' + self.PNEUMONIA
        self.SHUFFLE_BUFFER = 2048
        self.READER_SHUFFLE_BUFFER_TRAIN = 3000
        self.READER_SHUFFLE_BUFFER_VAL = 1000

        # down sampling params
        self.DOWN_SAMPLING_RATIO = 1

        # parameters for the model
        self.IMAGE_SIZE = [224, 224]
        self.CHANNELS = 3
        self.WEIGHTS = 'imagenet'
        self.N_CLASSES_PNEUMONIA = 1
        self.N_CLASSES = len(self.CLASSES)
        self.ACTIVATION = 'sigmoid'

        # parameters for the trainer
        self.LOSS = 'binary_crossentropy'
        self.TRAIN_SPLIT = .7
        self.VAL_SPLIT = .2
        self.TEST_SPLIT = .1

        # CGS bucket to store history and weights
        self.PROJECT_ID = 'dl-projects-2020'
        self.BUCKET_NAME = 'dl-projects-2020-bucket-1'
        self.CHEXNET_BLOB = 'chexnet'

        self.SEED = 42

    def to_json(self):
        return json.dumps(self,
                          default=lambda o: o.__dict__,
                          sort_keys=True,
                          indent=4)


params = Params()

In [42]:
params.SEED, params.TRAIN_SPLIT

(42, 0.7)

In [45]:
patients_train, patients_val = train_test_split(list(patients),
                                                train_size=params.TRAIN_SPLIT,
                                                test_size=1-params.TRAIN_SPLIT,
                                                random_state=params.SEED)
patients_val, patients_test = train_test_split(patients_val,
                                               train_size=.7,
                                               test_size=.3,
                                               random_state=params.SEED)

In [46]:
len(patients_train), len(patients_val), len(patients_test)

(21563, 6469, 2773)

In [52]:
patients_train_set, patients_val_set, patients_test_set = \
set(patients_train), set(patients_val), set(patients_test)

### 2 - split files

In [73]:
file_names = pd.read_csv('df_files.csv')

In [74]:
file_names.head()

Unnamed: 0,file_path
0,/kaggle/input/data/images_006/images/00012162_...
1,/kaggle/input/data/images_006/images/00013655_...
2,/kaggle/input/data/images_006/images/00012911_...
3,/kaggle/input/data/images_006/images/00012987_...
4,/kaggle/input/data/images_006/images/00011837_...


In [67]:
def get_patient(file_path):
    patient_str = file_path.split('/')[-1][:-8]
#     print(patient_str)
    return int(patient_str)

In [66]:
fp = file_names.loc[0, 'file_path']
fp, get_patient(fp)

00012162


('/kaggle/input/data/images_006/images/00012162_001.png', 12162)

In [75]:
file_names['patient_id'] = file_names['file_path'].apply(get_patient)

In [76]:
file_names.head()

Unnamed: 0,file_path,patient_id
0,/kaggle/input/data/images_006/images/00012162_...,12162
1,/kaggle/input/data/images_006/images/00013655_...,13655
2,/kaggle/input/data/images_006/images/00012911_...,12911
3,/kaggle/input/data/images_006/images/00012987_...,12987
4,/kaggle/input/data/images_006/images/00011837_...,11837


In [77]:
len(file_names.patient_id.unique())

30805

In [79]:
def get_split(patient_id):
    if patient_id in patients_train_set:
        return 'train'
    elif patient_id in patients_val_set:
        return 'val'
    elif patient_id in patients_test_set:
        return 'test'
    else:
        raise ValueError('wrong patient_id')

In [81]:
file_names['split'] = file_names['patient_id'].apply(get_split).astype('string')

In [82]:
file_names.split.value_counts()

train    78566
val      23800
test      9754
Name: split, dtype: Int64

In [83]:
file_names.head()

Unnamed: 0,file_path,patient_id,split
0,/kaggle/input/data/images_006/images/00012162_...,12162,train
1,/kaggle/input/data/images_006/images/00013655_...,13655,train
2,/kaggle/input/data/images_006/images/00012911_...,12911,train
3,/kaggle/input/data/images_006/images/00012987_...,12987,test
4,/kaggle/input/data/images_006/images/00011837_...,11837,val


In [87]:
len(file_names[file_names.split == 'train'].patient_id.unique()), len(patients_train)

(21563, 21563)

In [88]:
len(file_names[file_names.split == 'val'].patient_id.unique()), len(patients_val)

(6469, 6469)

In [89]:
len(file_names[file_names.split == 'test'].patient_id.unique()), len(patients_test)

(2773, 2773)

In [91]:
file_names.to_csv('chexnet_split.csv', index=False)

## 3 - reading files from GCS

In [2]:
filename = 'gs://dl-projects-2020-bucket-1/chexnet/01-pneumonia-downsampling-3/params.json'
with gcs.open(filename) as gcs_file:
    contents = gcs_file.read()
print(type(contents))

NameError: name 'gcs' is not defined