# Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
!unzip -q "/content/drive/MyDrive/isic-2019.zip" #https://www.kaggle.com/datasets/andrewmvd/isic-2019

Mounted at /content/drive


In [2]:
import os
#os.system('pip install .')
os.system('pip install git+https://github.com/hamish-haggerty/base_rbt.git')
os.system('pip install git+https://github.com/hamish-haggerty/cancer-proj.git')
!pip install -qU git+https://github.com/hamish-haggerty/cancer-proj.git #sometimes os doesn't work so use this if imports below fail

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [3]:
#| export
from fastai.vision.all import *
from base_rbt.all import *
#TODO: wrap this in an .all
from cancer_proj.cancer_dataloading import *
from cancer_proj.cancer_metrics import *
from cancer_proj.cancer_maintrain import *

from self_supervised.augmentations import assert_aug_pipelines
from self_supervised.layers import create_mlp_module
from statistics import mean,stdev

import fastai
test_eq(fastai.__version__,'2.7.11')

import torch
test_eq(torch.__version__,'1.13.1+cu116')


# Get data paths:

In [4]:
# Directory on Google Drive used for saving models, result dictionaries, etc.
save_directory = '/content/drive/My Drive/cancer_colab'
# exist_ok=True makes the cell idempotent without a separate exists() check
# (and fails loudly if the path exists but is not a directory).
os.makedirs(save_directory, exist_ok=True)

In [5]:
# Locations of the ISIC-2019 training images and of the ground-truth table.
directory = "/content/drive/MyDrive/ISIC_2019_Training_Input/"
_ground_truth_csv = "/content/drive/MyDrive/ISIC_2019_Training_GroundTruth.csv"

# Read the ground-truth CSV once. `labels` keeps the full table; `data` is the same
# table without the "UNK" column and without rows whose image name contains
# "downsampled". (Previously the file was parsed twice from the same path.)
labels = pd.read_csv(_ground_truth_csv)
data = labels.drop("UNK", axis=1)
data = data[~data["image"].str.contains("downsampled")]

# Load the cached dictionary holding the file list and the filename -> label mapping.
data_dict = load_dict_from_gdrive(directory=save_directory,filename='data_dict')
_fnames = data_dict['_fnames']
#_fnames = get_image_files(directory)
#_fnames = [name for name in _fnames if 'downsampled' not in name.as_posix()] #otherwise load like this
# The cached file list must have exactly one entry per row kept in `data`.
test_eq(len(_fnames),len(data))

label_func_dict = data_dict['label_func_dict']

def label_func(name):
    "Return the label stored for `name` in `label_func_dict` (KeyError if `name` is absent)."
    return label_func_dict[name]

# Labels aligned index-for-index with `_fnames`.
_labels = [label_func(i) for i in _fnames]

# Build training, tuning, validation and test sets:
    - The training set is treated as unlabelled
    - The tuning set is used for supervised fine-tuning
    - The validation set is a held-out (proxy) test set
    - The test set is used for reporting results

In [6]:
#tests / sanity checks:
test_eq(process_path(_fnames[0]),'ISIC_0071718.jpg')
test_eq(process_path(_fnames[10]),'ISIC_0071719.jpg')

_fnames_dict = get_fnames(_fnames,_labels,label_func)
fnames_train,fnames_valid,fnames_test = _fnames_dict['fnames_train'],_fnames_dict['fnames_valid'],_fnames_dict['fnames_test'] 
labels_train,labels_valid,labels_test = _fnames_dict['labels_train'],_fnames_dict['labels_valid'],_fnames_dict['labels_test'] 

print(f'Training (tuning) set has: \n{Counter(labels_train)}\n')

print(f'Validation set has: \n{Counter(labels_valid)}\n')

print(f'Test set has: \n{Counter(labels_test)}\n')

#A few tests: Make sure fnames_train and fnames_test the same every time
test_eq(process_path(fnames_train[44]),'ISIC_0071754.jpg')
test_eq(process_path(fnames_test[10]),'ISIC_0000011.jpg')

#Make sure training and valid are disjoint
for path in fnames_valid: assert path not in fnames_train #check that valid set is disjoint from training (tuning) set

#Make sure test and valid+train are disjoint
for path in fnames_test: assert path not in fnames_train+fnames_valid #check that test set is disticnt from training and validation set

Training (tuning) set has: 
Counter({'NV': 500, 'MEL': 500, 'BCC': 500, 'BKL': 467, 'AK': 306, 'SCC': 171, 'VASC': 55, 'DF': 55})

Validation set has: 
Counter({'NV': 458, 'MEL': 309, 'BCC': 274, 'BKL': 110, 'AK': 63, 'SCC': 43, 'VASC': 12, 'DF': 11})

Test set has: 
Counter({'NV': 10601, 'MEL': 3339, 'BCC': 2549, 'BKL': 1663, 'AK': 498, 'SCC': 414, 'VASC': 186, 'DF': 173})



# Setup dataloaders

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

size = 256
item_tfms = [Resize(size)]       # item transform for the tuning / validation / test loaders
item_tfms_train = [Resize(128)]  # smaller item transform for the Barlow Twins training loader

# Keyword arguments shared by every loader below: valid_pct=0 is passed to each,
# and worker processes are only requested when running on the GPU (12, else 0).
_dl_common = dict(valid_pct=0, device=device, num_workers=12 if device == 'cuda' else 0)

# Supervised fine-tuning loader.
dls_tune = ImageDataLoaders.from_path_func(
    directory, fnames_train, label_func, bs=64, item_tfms=item_tfms, **_dl_common)

# Validation loader.
dls_valid = ImageDataLoaders.from_path_func(
    directory, fnames_valid, label_func, bs=256, item_tfms=item_tfms, **_dl_common)

#This is for training BT (so viewed as unlabelled)
dls_train = ImageDataLoaders.from_path_func(
    directory, fnames_train, label_func, bs=256, item_tfms=item_tfms_train, **_dl_common)

# Test loader; shuffling is switched off for this one only.
dls_test = ImageDataLoaders.from_path_func(
    directory, fnames_test, label_func, bs=64, item_tfms=item_tfms, shuffle=False, **_dl_common)

# Class vocabulary of the tuning loader and the two lookup tables derived from it.
vocab = dls_tune.vocab
classes_to_int = {cls_name: idx for idx, cls_name in enumerate(vocab)}
int_to_classes = dict(enumerate(vocab))

# Debugging / verifying that it works:

# Aug pipelines

In [None]:
# Build all augmentation pipelines in one call, then pull the three we use out of the
# returned dictionary:
#   aug_pipelines      - heavy augmentation, used to train BT
#   aug_pipelines_tune - used for fine tuning
#   aug_pipelines_test - test time augmentation (generally same as the tuning one)
aug_dict = create_aug_pipelines(
    size=size,
    device=device,
    Augs=BYOL_Augs,
    TUNE_Augs=TUNE_Augs,
    Val_Augs=Val_Augs,
)
aug_pipelines, aug_pipelines_tune, aug_pipelines_test = (
    aug_dict[key] for key in ('aug_pipelines', 'aug_pipelines_tune', 'aug_pipelines_test')
)

In [None]:
#show_bt_batch(dls_train,n_in=3,n=2,aug=aug_pipelines)

# Let's explore whether pretraining a second time helps performance (pre-pre training)

In [None]:
#Set the training dataloader for BT equal to all the available data
# Alternatives that were tried:
#_fnames_train = fnames_train + fnames_valid
#_fnames_train = [name for name in fnames_train if label_func(name)=='SCC']

# NOTE(review): this pool contains the test-set images (used here without the labels
# being consumed by BT, presumably) - confirm that this is intended and that it is
# reported as such alongside the results.
_fnames_train=fnames_train+fnames_test

# Rebuild the BT training loader over the enlarged file pool (replaces the earlier `dls_train`).
dls_train  = ImageDataLoaders.from_path_func(directory,_fnames_train, label_func,
                                bs=128,
                                item_tfms=item_tfms,
                                valid_pct=0,
                                device=device,
                                num_workers=12*(device=='cuda')
                                             )
#Need to lower size for memory requirements
# NOTE(review): the size passed below is still 256 (same as `size`) - confirm whether a
# smaller value was intended.
_aug_dict = create_aug_pipelines(size=256,device=device,Augs=BYOL_Augs,TUNE_Augs=TUNE_Augs,Val_Augs=Val_Augs)
# Read the pipeline from the dictionary that was just built; this previously indexed the
# older `aug_dict`, so the freshly created `_aug_dict` was never used.
aug_pipelines = _aug_dict['aug_pipelines'] #Heavy augmentation. Use to train BT

In [None]:
# bt_model,_ = create_model('bt_pretrain',device)
# learn = Learner(dls_train,bt_model,splitter=my_splitter_bt,cbs=[BarlowTwins(aug_pipelines,n_in=3,lmb=1/8192,print_augs=False)])

# learn.freeze()
# #learn.summary() #You can call this block to verify that the the encoder is being frozen
# learn.fit(10)
# learn.unfreeze()
# lrs = learn.lr_find()
# print(lrs.valley)
# learn.fit_one_cycle(500,lrs.valley)

# Train Barlow Twins and fine tune

In [None]:
def train_bt(freeze_epochs,epochs):
    """Train barlow twins and return the fitted `Learner`.

    Uses the notebook-level `dls_train`, `aug_pipelines` and `device`.

    freeze_epochs: number of epochs to fit after `learn.freeze()`.
    epochs: number of one-cycle epochs to fit after `learn.unfreeze()`.
    """
    model, _ = create_model('bt_pretrain', device)
    bt_callback = BarlowTwins(aug_pipelines, n_in=3, lmb=1/8192, print_augs=False)
    learn = Learner(dls_train, model, splitter=my_splitter_bt, cbs=[bt_callback])

    # Stage 1: fit with the learner frozen.
    # (Calling learn.summary() here lets you verify that the encoder is being frozen.)
    learn.freeze()
    learn.fit(freeze_epochs)

    # Stage 2: unfreeze, take the learning rate suggested by the LR finder, print it,
    # and run a one-cycle schedule with it.
    learn.unfreeze()
    suggestion = learn.lr_find()
    print(suggestion.valley)
    learn.fit_one_cycle(epochs, suggestion.valley)

    return learn

In [8]:
# Metadata and paths for this experiment.
# NOTE(review): the description says "train+valid" and "resize=128", but in this notebook
# the BT loader is built from fnames_train+fnames_test with the 256-pixel item transform -
# confirm which is right before relying on this text.
description = 'We pretrained BT initial weights on train+valid. lmb=1/8192,bs=128,resize=128'
initial_weights='bt_prepretrained' #pretrain again, hence `prepre`
epochs=40  # fine-tuning epochs passed to main_tune
tune_model_path = save_directory + f'/initial_weights={initial_weights}'
dict_path = initial_weights #file to load dictionary metadata
#########

_runs=range(5)
for i in _runs:

    #Train barlow twins (a fresh model for every run).
    _learn = train_bt(freeze_epochs=1,epochs=100)

    #load results of the previous runs so this run is added to them; nothing to load on
    #the first run. Uses `dict_path` rather than repeating the file name as a literal.
    _results=None
    if i>0: _results = load_dict_from_gdrive(save_directory,dict_path)

    runs=[i]
    #fine tune
    # Evaluation now receives `aug_pipelines_test`; the tuning pipeline used to be passed
    # for both arguments, which left the dedicated test-time pipeline unused.
    btprpre_results = main_tune(encoder = _learn.model.encoder,
                            initial_weights=initial_weights,epochs=epochs,device=device,dls_tune=dls_tune,dls_test=dls_test,
                            aug_pipelines_tune=aug_pipelines_tune,aug_pipelines_test=aug_pipelines_test,int_to_classes=int_to_classes,
                            tune_model_path=tune_model_path, dict_path = dict_path,save_directory=save_directory,description=description,
                            results=_results,runs=runs,
                                )

In [7]:
# Load the saved per-run results and collect each run's accuracy.
_results = load_dict_from_gdrive(save_directory,'bt_prepretrained')
lst = [run_metrics['acc'] for run_metrics in _results.values()]

# Print every run key followed by its accuracy.
for run_key, run_acc in zip(_results, lst):
    print(run_key)
    print(run_acc)

# Mean and sample standard deviation of the accuracy across runs.
print('\n')
print(mean(lst))
print(stdev(lst))

0
0.7280029058456421
1
0.7199196815490723
2
0.7186840176582336
3
0.7226483821868896
4
0.7207949161529541
5
0.7260979413986206


0.7226913074652354
0.003664878074809444
