## Quick attempt to train a model

In [None]:
import fastai
from fastai import *
from fastai.vision import *
from sklearn.model_selection import train_test_split
import time

verbose = False  # should print out extra details?

%matplotlib inline

In [None]:
# Mini-batch size passed to the data loaders (see get_data).
bs = 64
num_workers = 0  # Anything greater than zero will get error: DataLoader worker (pid 57) is killed by signal: Bus error
# Target size passed to the image transforms when the data bunch is built.
image_size = 224

In [None]:
!ls ../input

In [None]:
# Root of the competition data, plus its train/ and test/ image folders.
data_fp = Path('../input')
data_train, data_test = (data_fp.joinpath(sub) for sub in ('train', 'test'))

## Looking at the data

In [None]:
# Load the training labels table, then report its shape and how many
# distinct ids (classes) it contains.
labels = pd.read_csv(data_fp / 'train.csv')
print(labels.shape)
num_classes = labels['Id'].unique().size
print(f'Number of classes: {num_classes:,}')
labels.head()

## Split data into training and validation sets

In [None]:
# Number of images per class, rarest class first.
class_counts = labels.Id.value_counts(sort=True, ascending=True)
# Count singleton classes with the vectorised Series.sum() rather than
# iterating the Series element by element with the builtin sum().
n_singletons = (class_counts == 1).sum()
# Image count of the catch-all "new_whale" label (already a scalar).
n_new_whale = class_counts["new_whale"]
print(f'The number of images: {class_counts.sum():,}')
print('{}'.format('='*20))
print(f'Number of classes with only one image: {n_singletons:,}')
print(f'Percentage of classes with one image: {n_singletons/len(labels.Id.unique()):.0%}')
print('{}'.format('='*20))
print(f'Number of new_whale image: {n_new_whale:,}')
print(f'Percentage of images are new_whale: {n_new_whale/class_counts.sum()*100:0.0f}%')

In [None]:
# class_counts is sorted ascending, so reverse it before taking the first five.
class_counts[::-1][:5]  # top five most common class

## Stratified Split

In [None]:
start_time = time.time()
## Stratified sampling that can handle classes with a single image:
## a single-image class goes entirely to training, every other class is
## split 80/20 between training and validation.
# The per-class pieces are collected in lists and concatenated once at the
# end. Series.append was removed in pandas 2.0, and it re-copied the whole
# accumulated Series on every iteration.
train_parts, val_parts = [], []
for _, group in labels.reset_index()[['index', 'Id']].groupby('Id'):
    row_ids = group['index']
    ## if a class only has 1 sample, keep that one for training
    if len(row_ids) == 1:
        train_parts.append(row_ids)
        continue
    ## split each group randomly and obtain their index
    ## (random_state=None: the split differs from run to run)
    train, val = train_test_split(row_ids, test_size=0.2, random_state=None)
    train_parts.append(train)
    val_parts.append(val)

# Fall back to an empty integer Series so later cells still get a Series
# even when there is nothing to concatenate.
train_idx = pd.concat(train_parts) if train_parts else pd.Series(dtype='int64')
val_idx = pd.concat(val_parts) if val_parts else pd.Series(dtype='int64')

print(f'It took {int(time.time() - start_time)} seconds')

In [None]:
# Peek at the first few row indices assigned to the training split.
train_idx.head()

In [None]:
## verify that every class is represented among the training indices
n_train_classes = len(labels.loc[train_idx, 'Id'].unique())
n_all_classes = len(labels['Id'].unique())
assert n_train_classes == len(labels.Id.unique())
print('Number of class {}: {}'.format(n_train_classes, n_all_classes))
train_fraction = len(train_idx) / labels.shape[0]
print('Percent of training split: {:.0%}'.format(train_fraction))

## Create ImageDataBunch

In [None]:
tfms = get_transforms(flip_vert=False, max_zoom=1)  ## remove vertical and zooming
# A bare expression inside an `if` is never displayed by the notebook, so
# print it explicitly.
# tfms is the list of transformations done to the images:
# tfms[0] is for training and tfms[1] is for validation.
if verbose:
    print(tfms)

In [None]:
# Assemble the labelled item lists from the `labels` dataframe.
src = (
    # image file names come from the dataframe column 'Image', under folder 'train'
    ImageItemList.from_df(path=data_fp, df=labels, cols='Image', folder='train')
    # validation rows are fixed by the precomputed row indices, not drawn at random
    .split_by_idx(val_idx)
    # the class of each image comes from the dataframe column 'Id'
    .label_from_df(cols='Id')
    # images used for inference / the Kaggle submission
    .add_test_folder()
)
if verbose:
    print(f'Type({type(src)})')
    print(src)  # show a summary of the datasets

In [None]:
def get_data(size, bs, padding_mode='reflection'):
    """Build a normalised data bunch from the module-level `src` item lists.

    `size` is the target image size and `bs` the batch size. `padding_mode`
    is forwarded to the transform step, which resizes with
    `ResizeMethod.PAD`. The loaders use the module-level `num_workers`.
    """
    transformed = src.transform(
        tfms,
        size=size,
        resize_method=ResizeMethod.PAD,
        padding_mode=padding_mode,
    )
    # creates the data loaders
    bunch = transformed.databunch(bs=bs, num_workers=num_workers)
    # normalise with imagenet's mean and std because a pretrained model is used
    return bunch.normalize(imagenet_stats)

In [None]:
# Build the data bunch at the configured image size and batch size, using
# 'border' padding instead of get_data's default 'reflection'.
data = get_data(image_size, bs, 'border')

In [None]:
# Display examples of the transformation on a single, randomly chosen image
idx = np.random.randint(len(data.train_ds))


def _plot(i, j, ax):
    "Draw training item `idx` (module-level) on `ax`; grid position (i, j) is unused."
    img, label = data.train_ds[idx]
    img.show(ax, y=label)


plot_multi(_plot, 3, 3, figsize=(8, 8))  ## show how the image is being transformed

In [None]:
open_image(data.train_ds.items[idx])  ## original image, for comparison with the grid plotted from the same idx

In [None]:
#data.show_batch(rows=2, figsize=(8,8))  # this crashes the kernel

## Training

In [None]:
def mapr(input: torch.Tensor, targs: torch.LongTensor, mapn: int):
    """Compute the mean average precision at `mapn` for single-label targets.

    input: per-class scores, one row per sample (higher is better).
    targs: the true class index of each sample.
    mapn:  how many top-ranked predictions per sample are considered.

    A sample scores 1/rank when its target is among the top predictions
    (rank counted from 1) and 0 otherwise. The result is the mean over
    samples, returned as a 0-dim tensor.
    """
    n = targs.shape[0]  # number of samples
    # Never ask for more predictions than there are classes; otherwise the
    # hit mask and the rank vector below have different widths and the
    # division fails to broadcast.
    k = min(mapn, input.shape[-1])
    top_k = input.argsort(dim=-1, descending=True)[:, :k]
    hits = (top_k == targs.view(n, -1)).float()
    ranks = torch.arange(1, k + 1, device=top_k.device).float()
    return (hits / ranks).sum(dim=-1).mean()

# MAP@5 with the two-argument signature expected of a metric.
map5 = partial(mapr, mapn=5)

In [None]:
# Create the learner: resnet50 architecture, reporting accuracy and MAP@5.
learn = create_cnn(data=data, arch=models.resnet50, metrics=[accuracy, map5], model_dir = '/tmp/models')
# make sure your kernel has internet access (presumably to download the pretrained weights)
# model_dir is needed because it will otherwise try to create a models directory in the input folder, which is read-only

In [None]:
# Run the learning-rate finder and plot what the recorder captured.
learn.lr_find()
learn.recorder.plot()

In [None]:
# First training run: 10 epochs, before learn.unfreeze() is called.
learn.fit(10)

In [None]:
# Plot the losses recorded during the training run above.
learn.recorder.plot_losses()

In [None]:
# Plot the recorded learning rate, together with the momentum values.
learn.recorder.plot_lr(show_moms=True)

In [None]:
# Unfreeze the model and train for another 10 epochs.
learn.unfreeze()
learn.fit(10)

In [None]:
# Plot the losses recorded during the run after unfreezing.
learn.recorder.plot_losses()

In [None]:
# Predict on the test set; the second returned value is not needed and is discarded.
pred, _ = learn.get_preds(ds_type=DatasetType.Test)

In [None]:
def create_submission(preds, data, path, name, mapn=5):
    """Write a submission CSV with the top `mapn` predicted classes per test image.

    preds: per-class scores, one row per test image, in the same order as
           `data.test_ds.items`.
    data:  object exposing `classes` (class names, indexed by class id) and
           `test_ds.items` (test image paths).
    path:  output directory, as a `str` or `Path`.
    name:  file name of the CSV inside `path`.
    mapn:  number of predictions kept per image.

    The CSV has two columns: "Image" (file name) and "Id" (predicted class
    names, best first, separated by spaces).
    """
    from pathlib import Path  # local import keeps the function self-contained

    # Rank classes by descending score. `.cpu()` is a no-op for CPU tensors
    # and lets predictions that still live on the GPU be converted.
    top_idx = preds.argsort(dim=-1, descending=True)[:, :mapn].cpu().numpy()
    class_names = np.asarray(data.classes)
    submission = pd.DataFrame({
        "Image": [fn.name for fn in data.test_ds.items],
        "Id": [" ".join(class_names[row]) for row in top_idx],
    })
    # Path(...) accepts both str and Path, so a plain string directory works.
    submission.to_csv(Path(path) / name, index=False)

In [None]:
# Directory the submission file is written to (the current working directory).
sub_fp = Path(".")

In [None]:
# Write the test predictions to sub_fp/testing2.csv, keeping the default number of predictions per image.
create_submission(pred, learn.data, sub_fp,'testing2.csv')

In [None]:
# Read the written file back and show its first rows as a quick check.
pd.read_csv(sub_fp/'testing2.csv').head()

In [None]:
# !df -h  # display compute specs