In [None]:
#default_exp data.lists

In [None]:
%load_ext autoreload
%autoreload 2

# Basic list class

> List classes and Transform used extensively in the library

Documentation is here: https://kasparlund.github.io/ai_pytorch/

In [None]:
# export

from collections.abc import *

class ListContainer():
    """Thin wrapper around a sequence that adds fancy indexing and a few helpers.

    Indexing accepts:
      * an int or slice  -> delegated directly to the underlying `items`
      * a boolean mask   -> one flag per item, returns the selected items as a list
      * a sequence of integer positions -> returns the items at those positions as a list
    """
    def __init__(self, items): self.items = items

    def __getitem__(self, idx):
        if isinstance(idx, (int,slice)): return self.items[idx]
        # An empty index sequence selects nothing. Peeking at idx[0] below
        # would otherwise raise IndexError.
        if len(idx)==0: return []
        if isinstance(idx[0],bool):
            assert len(idx)==len(self) # bool mask
            return [o for m,o in zip(idx,self.items) if m]
        return [self.items[i] for i in idx]

    def __len__(self): return len(self.items)
    def __iter__(self): return iter(self.items)
    def __setitem__(self, i, o): self.items[i] = o
    def __delitem__(self, i): del(self.items[i])

    def __repr__(self):
        # Show the class name, the item count and at most the first 10 items.
        res = f'{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}'
        # Swap the closing bracket for '...]' to signal that the listing is truncated.
        if len(self)>10: res = res[:-1]+ '...]'
        return res

    def new(self, items):
        """Build a container of the same class as `self` around `items`."""
        return self.__class__(items)

    def unique(self):
        """Return the distinct items as a `set`."""
        return set(self)

    def split2ways(self,selector):
        """Split into (items whose selector entry is truthy, items whose entry is falsy).

        `selector` is paired with the items through `zip`, so pairing stops at the
        shorter of the two. Both halves are created through `new`.
        """
        true_list  = [i for i,s in zip(self,selector) if s]
        false_list = [i for i,s in zip(self,selector) if not s]
        return self.new(true_list), self.new(false_list)

    # parent labels use lambda path: path.parent.name
    # grandparent labels use lambda path: path.parent.parent.name
    def label_by_func(self, f, clsReturned=None):
        """Apply `f` to every item and wrap the results in `clsReturned`.

        When `clsReturned` is None the results are wrapped in a plain `ListContainer`.
        """
        cls = ListContainer if clsReturned is None else clsReturned
        return cls([f(o) for o in self])


In [None]:
# export

def listify(o):
    """Coerce `o` into a list.

    * a list is returned as-is (same object, no copy)
    * None becomes an empty list
    * a string is treated as a single value, not as a sequence of characters
    * any other iterable is materialised with `list(...)`
    * everything else is wrapped in a one-element list
    """
    if isinstance(o, list):
        return o
    if o is None:
        return []
    # Strings are iterable too, but must stay whole.
    if isinstance(o, Iterable) and not isinstance(o, str):
        return list(o)
    return [o]

def setify(o):
    """Coerce `o` into a set; an existing set is returned as-is, anything else goes through `listify`."""
    if isinstance(o, set):
        return o
    return set(listify(o))

from collections import OrderedDict
def uniqueify(x, sort=False):
    """Return the distinct elements of `x` as a list.

    First-occurrence order is kept unless `sort` is true, in which case the
    result is sorted ascending.
    """
    # OrderedDict keys remember insertion order, which drops duplicates
    # while keeping the first occurrence of each element.
    deduped = [*OrderedDict.fromkeys(x)]
    return sorted(deduped) if sort else deduped


In [None]:
# export
import random

#basic structures that are referenced in many places
class Transform:
    """Base transform: a callable that hands `source` back untouched."""
    def __call__(self, source):
        # Identity by default; Transforms below overrides this to chain several transforms.
        return source

class Transforms(Transform):
    """Chain several transforms into one callable, applied in sequence.

    transforms: a single callable or a collection of callables (normalised with `listify`)
    shuffle:    when true, the stored order is re-shuffled in place before every call
    """
    #provide the transforms as a list
    def __init__( self, transforms, shuffle=False):
        # listify returns a list argument as-is, so take a copy here. Otherwise
        # the in-place shuffle in __call__ would reorder the caller's own list.
        self.transforms, self.shuffle = list(listify(transforms)), shuffle

    #applies a transform to source
    def __call__( self, source ):
        if self.shuffle :
            random.shuffle(self.transforms)
        # Feed the output of each transform into the next one.
        for tfm in self.transforms:
            source = tfm( source )
        return source

class DataBunch():
    """Bundle a training and a validation data loader with the `c_in` / `c_out` values."""
    def __init__(self, train_dl, valid_dl, c_in, c_out):
        self.train_dl = train_dl
        self.valid_dl = valid_dl
        self.c_in = c_in
        self.c_out = c_out

    @property
    def train_ds(self):
        # The dataset attribute of the training loader.
        return self.train_dl.dataset

    @property
    def valid_ds(self):
        # The dataset attribute of the validation loader.
        return self.valid_dl.dataset

class Callback():
    """Base class for callbacks.

    `name` is the class name with a trailing 'Callback' removed, passed through
    `camel2snake`; a class called exactly 'Callback' uses 'callback' instead.
    """
    @property
    def name(self):
        # `re` is not imported at module level in this file, so import it here;
        # without this the property raises NameError in the exported module.
        import re
        # NOTE(review): camel2snake is not defined in this file either. It is
        # presumably provided by lib.core; confirm it is in scope wherever this
        # module is imported.
        name = re.sub(r'Callback$', '', self.__class__.__name__)
        return camel2snake(name or 'callback')
    

In [None]:
# export
from pathlib import Path

"""
We use the `ListContainer` to store our objects in an `ItemList`. 
The `get` method will need to be subclassed to explain how to access an element 
(open an image for instance), then the private `_get` method can allow us to apply any 
additional transform to it.
`new` will be used in conjunction with `__getitem__` (which works for one index or a list of indices) 
to create training and validation sets from a single stream when we split the data.

Transforms only need to be functions that take an element of the ItemList and transform it. 
If they need state, they can be defined as a class. Also, having them as a class makes it 
possible to define an _order attribute (default 0) that is used to sort the transforms.

"""

class ItemList(ListContainer):
    """ListContainer that remembers a base `path` and runs every fetched item through `tfm`.

    Indexing first resolves the raw item(s) through ListContainer, then passes each
    one through `get` followed by the transform.
    """
    def __init__(self, items, path='.', tfm=Transform()):
        super().__init__(items)
        self.path = Path(path)
        self.tfm = tfm

    def __repr__(self):
        base = super().__repr__()
        return f'{base}\nPath: {self.path}'

    def new(self, items):
        # Carry the path and transform over to the new container.
        return self.__class__(items, self.path, self.tfm)

    def get(self, i):
        # Hook for subclasses: turn a stored item into the actual element (identity here).
        return i

    def _get(self, i):
        # get() first, then the transform on its result.
        return self.tfm(self.get(i))

    def __getitem__(self, idx):
        picked = super().__getitem__(idx)
        if not isinstance(picked, list):
            return self._get(picked)
        return [self._get(o) for o in picked]
        
        

In [None]:
# export
import os
class FileList(ItemList):
    """ItemList whose items are file paths collected from a directory."""

    @staticmethod
    def _get_files(p, fs, extensions=None):
        """Return `p/f` for every name in `fs` that is not hidden and has an accepted extension.

        p:          directory the names in `fs` belong to
        fs:         iterable of file names (no directory part)
        extensions: collection of lower-case extensions including the leading dot;
                    when empty or None, every non-hidden file is accepted
        """
        p = Path(p)
        # splitext gives '' for a name without a dot, so a file literally named
        # 'png' is not mistaken for a file with extension '.png'.
        res = [p/f for f in fs if not f.startswith('.')
               and ((not extensions) or os.path.splitext(f)[1].lower() in extensions)]
        return res

    @staticmethod
    def get_files(path, extensions=None, recurse=False, include=None):
        """List the files under `path`, optionally filtered by extension.

        path:       directory to scan
        extensions: accepted extensions (any form `setify` understands); compared case-insensitively
        recurse:    walk sub-directories when true, otherwise only look at `path` itself
        include:    when given, only these top-level sub-directories are descended into
        Returns a list of `Path` objects.
        """
        path       = Path(path)
        extensions = setify(extensions)
        extensions = {e.lower() for e in extensions}
        if recurse:
            res = []
            for i,(p,d,f) in enumerate(os.walk(path)): # returns (dirpath, dirnames, filenames)
                # Assigning to d[:] prunes the walk in place: at the top level keep
                # only the `include` directories (if given); otherwise, and at every
                # deeper level, skip hidden directories.
                if include is not None and i==0: d[:] = [o for o in d if o in include]
                else:                            d[:] = [o for o in d if not o.startswith('.')]
                res += FileList._get_files(p, f, extensions)
            return res
        else:
            f = [o.name for o in os.scandir(path) if o.is_file()]
            return FileList._get_files(path, f, extensions)

    @classmethod
    def from_files(cls, path, extensions, recurse=True, include=None, tfm=Transform()):
        """Build an instance of `cls` from the files found under `path` (see `get_files`)."""
        files = FileList.get_files(path, extensions, recurse=recurse, include=include)
        return cls(files, path=path, tfm=tfm)


# Test

In [None]:
from lib.core import *

import mimetypes
import numpy as np
import json
from lib.data.external import *
# Smoke test on MNIST: untar_data / URLs come from lib.data.external.
path = untar_data(URLs.MNIST)

# Every extension that mimetypes maps to an image/* type.
imageExtensions = {ext for ext, mime in mimetypes.types_map.items() if mime.startswith('image/')}
files  = FileList.from_files(path, imageExtensions)
# Label = name of the parent folder; training flag = grandparent folder is "training".
labels = files.label_by_func(lambda path: path.parent.name)
ibx_training = files.label_by_func(lambda path: path.parent.parent.name == "training")

print(
    f"nb files: {len(files)} nb labels: {len(labels)} nb b_idx_training: {len(ibx_training)}"
    f" unique labels:{len(labels.unique())}"
)

sum(ibx_training)
files[0]

nb files: 70000 nb labels: 70000 nb b_idx_training: 70000 unique labels:10


PosixPath('/Users/kasparlund/.fastai/data/mnist_png/training/9/36655.png')

In [None]:
# Partition the files with the boolean training mask. split2ways builds both
# halves through new(), so each half is of the same class as `files`.
train, test = files.split2ways(ibx_training)
(
    type(files), type(train), type(test),
    len(files), len(train), len(test),
)

(__main__.FileList, __main__.FileList, __main__.FileList, 70000, 60000, 10000)

In [None]:
#hide
from nbdev.export import notebook2script
# Run nbdev's notebook export; the cell output lists each notebook it converted.
notebook2script()

Converted 00_core.ipynb.
Converted 00_test.ipynb.
Converted 01_data.external.ipynb.
Converted 02_lists.ipynb.
Converted 03_images-back 31-juni-2021.ipynb.
Converted 03_images.ipynb.
Converted 04_databunchs_undone.ipynb.
Converted 05_Learner.ipynb.
Converted 05_model.ipynb.
Converted 06_modelmanger.ipynb.
Converted 07_optimizers.ipynb.
Converted app_image_01_imagenette_optimizers.ipynb.
Converted app_image_01_mnist_optimizers.ipynb.
Converted augmentation_cpu.ipynb.
Converted data_block.ipynb.
Converted index.ipynb.
Converted mnist_experiments.ipynb.
Converted mnist_initi_batchnorm.ipynb.
Converted transfer_learning.ipynb.
