In [35]:
import pandas as pd
from path import Path
import os
import shutil
import fastai
from fastai.vision import *
from fastai.metrics import fbeta, Precision, Recall, accuracy_thresh
from sklearn.metrics import roc_auc_score, recall_score

In [36]:
path_raw_data = Path('../../data/full/')
files = os.listdir(path_raw_data)

path_data = path_raw_data/'images'

img_files = os.listdir(path_data)

df_labels = pd.concat([
    pd.read_csv("../model_data/train.csv").assign(subset='train'), 
    pd.read_csv("../model_data/dev.csv").assign(subset='valid'),
    pd.read_csv("../model_data/test.csv").assign(subset='test')
                      ],
                     axis=0)

df_labels = df_labels.loc[df_labels['Image Index'].isin(img_files)]

In [38]:
df_labels = df_labels.groupby(['Finding Labels']).apply(lambda x: x.sample(1)).reset_index(drop=True)
df_labels.loc[df_labels['Finding Labels'] == 'No Finding', 'Finding Labels'] = ''
df_labels

Unnamed: 0,Image Index,Patient ID,Finding Labels,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia,subset
0,00014248_000.png,14248,Atelectasis,1,0,0,0,0,0,0,0,0,0,0,0,0,0,train
1,00029940_006.png,29940,Atelectasis|Cardiomegaly,1,1,0,0,0,0,0,0,0,0,0,0,0,0,train
2,00027415_000.png,27415,Atelectasis|Cardiomegaly|Consolidation,1,1,0,0,0,0,0,0,1,0,0,0,0,0,train
3,00012364_006.png,12364,Atelectasis|Cardiomegaly|Consolidation|Edema,1,1,0,0,0,0,0,0,1,1,0,0,0,0,train
4,00015304_000.png,15304,Atelectasis|Cardiomegaly|Consolidation|Edema|E...,1,1,1,1,0,0,0,0,1,1,0,0,0,0,train
5,00017138_036.png,17138,Atelectasis|Cardiomegaly|Consolidation|Edema|E...,1,1,1,1,1,0,0,0,1,1,0,0,0,0,train
6,00019924_031.png,19924,Atelectasis|Cardiomegaly|Consolidation|Edema|E...,1,1,1,1,1,1,0,0,1,1,0,0,0,0,train
7,00019508_013.png,19508,Atelectasis|Cardiomegaly|Consolidation|Edema|E...,1,1,1,0,1,0,0,0,1,1,0,0,0,0,train
8,00011702_039.png,11702,Atelectasis|Cardiomegaly|Consolidation|Edema|E...,1,1,1,0,0,0,0,0,1,1,0,0,1,0,train
9,00015401_022.png,15401,Atelectasis|Cardiomegaly|Consolidation|Effusion,1,1,1,0,0,0,0,0,1,0,0,0,0,0,train


In [39]:
tfms = get_transforms(do_flip=True, 
                      flip_vert=False, 
                      max_rotate=5,
                      max_zoom=1.03,
                      max_lighting=0.03,
                      max_warp=0.03,
                      p_affine=0.05,
                      p_lighting=0.05,
                      )

basic_block = (ImageList
                .from_df(df_labels.query('subset!="test"'),
                         path=path_data)
                .split_by_files(valid_names=df_labels.query('subset=="valid"')['Image Index'].tolist())
                .label_from_df(label_delim='|', cols='Finding Labels')
              )

db_xrays = (basic_block
            .transform(tfms, 
                       size=64) 
            .databunch(bs=32))

db_xrays.normalize(imagenet_stats)

db_xrays.c

14

In [40]:
def auroc_score(input_, target):
    input_, target = input_.cpu().numpy(), target.cpu().numpy() #.reshape(-1, learn.data.c)
    score = []
    for i in range(14):
        try: 
            score.append(roc_auc_score(target[:,i], input_[:,i]))
        except ValueError:
            score.append(0)
    
    return np.array(score).mean()

#     def __init__(self):
#         pass

class AUROC(Callback):
    _order = -20 #Needs to run before the recorder
    def on_epoch_begin(self, **kwargs): self.output, self.target = [], []
    
    def on_batch_end(self, last_target, last_output, train, **kwargs):
        if not train:
            self.output.append(last_output)
            self.target.append(last_target)
                
    def on_epoch_end(self, last_metrics, **kwargs):
        if len(self.output) > 0:
            output = torch.cat(self.output)
            target = torch.cat(self.target)
            metric = auroc_score(output, target)
            return add_metrics(last_metrics, metric)



In [41]:
from functools import partial

accuracy_20 = partial(accuracy_thresh, thresh=0.2)
accuracy_50 = partial(accuracy_thresh, thresh=0.5)
f2_20 = partial(fbeta, thresh=0.2, beta=2)
f2_50 = partial(fbeta, thresh=0.5, beta=2)
f1_20 = partial(fbeta, thresh=0.2, beta=1)
f1_50 = partial(fbeta, thresh=0.5, beta=1)


model = cnn_learner(data=db_xrays, 
                   base_arch=models.densenet121,
                   metrics=[AUROC(), 
                            fbeta, 
                            accuracy_20, accuracy_50, 
                            f1_20, f1_50, f2_20, f2_50]
                   )


model.fit_one_cycle(cyc_len=2, max_lr=5e-2)

epoch,train_loss,valid_loss,auroc,fbeta,accuracy_thresh,accuracy_thresh.1,fbeta.1,fbeta.2,fbeta.3,fbeta.4,time
0,0.718671,3.719157,0.390823,0.440839,0.522556,0.531955,0.377762,0.330787,0.440839,0.368482,00:07
1,0.658354,0.615682,0.482431,0.576757,0.586466,0.684211,0.480716,0.134252,0.576757,0.106931,00:07


In [45]:
os.mkdir('../model_data/sample/')
os.mkdir('../model_data/sample/images')

In [46]:
for i in df_labels['Image Index'].tolist():
    shutil.copy(path_data/i, f'../model_data/sample/images/{i}')