# Summary Stats on Imagenette

[Imagenette](https://github.com/fastai/imagenette) is a smaller dataset of imagenet that contains 10 easily differentiable classes:   
* tench (n01440764)
* English springer (n02102040)
* cassette player (n02979186)
* chain saw (n03000684)
* church (n03028079)
* French horn (n03394916)
* garbage truck (n03417042)
* gas pump (n03425413)
* golf ball (n03445777)
* parachute (n03888257)

Imagenet Class Idx to human readable names in [json](https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json) 

In [5]:
import fastai
from fastai import *
from fastai.vision import *
from tqdm import tqdm_notebook
from pathlib import Path

# Import the Data

In [2]:
path = untar_data(URLs.IMAGENETTE, dest='data'); path

PosixPath('data/imagenette')

In [6]:
!tree -d

[01;34m.[00m
├── [01;34mdata[00m
│   └── [01;34mimagenette[00m
│       ├── [01;34mtrain[00m
│       │   ├── [01;34mn01440764[00m
│       │   ├── [01;34mn02102040[00m
│       │   ├── [01;34mn02979186[00m
│       │   ├── [01;34mn03000684[00m
│       │   ├── [01;34mn03028079[00m
│       │   ├── [01;34mn03394916[00m
│       │   ├── [01;34mn03417042[00m
│       │   ├── [01;34mn03425413[00m
│       │   ├── [01;34mn03445777[00m
│       │   └── [01;34mn03888257[00m
│       └── [01;34mval[00m
│           ├── [01;34mn01440764[00m
│           ├── [01;34mn02102040[00m
│           ├── [01;34mn02979186[00m
│           ├── [01;34mn03000684[00m
│           ├── [01;34mn03028079[00m
│           ├── [01;34mn03394916[00m
│           ├── [01;34mn03417042[00m
│           ├── [01;34mn03425413[00m
│           ├── [01;34mn03445777[00m
│           └── [01;34mn03888257[00m
└── [01;34mtmp[00m

25 directories


In [3]:
!tree -d {path}

[01;34mdata/imagenette[00m
├── [01;34mtrain[00m
│   ├── [01;34mn01440764[00m
│   ├── [01;34mn02102040[00m
│   ├── [01;34mn02979186[00m
│   ├── [01;34mn03000684[00m
│   ├── [01;34mn03028079[00m
│   ├── [01;34mn03394916[00m
│   ├── [01;34mn03417042[00m
│   ├── [01;34mn03425413[00m
│   ├── [01;34mn03445777[00m
│   └── [01;34mn03888257[00m
└── [01;34mval[00m
    ├── [01;34mn01440764[00m
    ├── [01;34mn02102040[00m
    ├── [01;34mn02979186[00m
    ├── [01;34mn03000684[00m
    ├── [01;34mn03028079[00m
    ├── [01;34mn03394916[00m
    ├── [01;34mn03417042[00m
    ├── [01;34mn03425413[00m
    ├── [01;34mn03445777[00m
    └── [01;34mn03888257[00m

22 directories


In [17]:
def images_per_class(dir_pn):
    '''
        count the number of samples in each class
        expect folder format of 
        dir_pn
        |-- class1
        |    |-- 123.jpg
        |    |-- n39.jpg
        |-- class3
        |    |-- 847.jpg
        |    |-- 3jd.jpb
    '''
    dir_pn = Path(dir_pn)
    cls_folders = [fn.name for fn in dir_pn.glob("*")]
    samples = [len(list((dir_pn/folder).glob("*"))) for folder in cls_folders]
    return cls_folders, samples

In [18]:
tmp = images_per_class(path/'train')

In [19]:
tmp

(['n03028079',
  'n03394916',
  'n03445777',
  'n03425413',
  'n02979186',
  'n03000684',
  'n01440764',
  'n03888257',
  'n02102040',
  'n03417042'],
 [1300, 1300, 1300, 1300, 1300, 1194, 1300, 1300, 1300, 1300])

In [16]:
path/'train'.is_file()

AttributeError: 'str' object has no attribute 'is_file'

In [7]:
test2 = [fn.name for fn in (path/'train').glob("*")]; test2

['n03028079',
 'n03394916',
 'n03445777',
 'n03425413',
 'n02979186',
 'n03000684',
 'n01440764',
 'n03888257',
 'n02102040',
 'n03417042']

In [10]:
samples = [len(list((path/'train'/x).glob("*"))) for x in test2]

In [11]:
samples

[1300, 1300, 1300, 1300, 1300, 1194, 1300, 1300, 1300, 1300]

In [14]:
pd.DataFrame({'class': test2, "train": samples})

Unnamed: 0,class,train
0,n03028079,1300
1,n03394916,1300
2,n03445777,1300
3,n03425413,1300
4,n02979186,1300
5,n03000684,1194
6,n01440764,1300
7,n03888257,1300
8,n02102040,1300
9,n03417042,1300


In [15]:
pd.DataFrame(zip(test2, samples))

Unnamed: 0,0,1
0,n03028079,1300
1,n03394916,1300
2,n03445777,1300
3,n03425413,1300
4,n02979186,1300
5,n03000684,1194
6,n01440764,1300
7,n03888257,1300
8,n02102040,1300
9,n03417042,1300


In [12]:
test2 = [x for x in (path/'train').glob("**/*") if x.is_file()]

12894

In [12]:
len(test2)

1300

In [14]:
np.random.randint(len(test2), 10)

ValueError: Range cannot be empty (low >= high) unless no samples are taken

In [15]:
size = []
for fn in test2:
    size.append(PIL.Image.open(fn).size)

In [16]:
len(size)

1300

In [21]:
tmp = pd.Series(size)

In [22]:
tmp.value_counts()

(500, 375)      263
(500, 333)       61
(400, 300)       44
(640, 480)       40
(240, 180)       24
(500, 334)       23
(375, 500)       23
(200, 150)       16
(600, 450)       14
(800, 600)       13
(500, 374)       12
(300, 225)       12
(500, 377)       11
(1600, 1200)      9
(1024, 768)       9
(500, 281)        8
(275, 206)        7
(448, 336)        6
(320, 240)        6
(333, 500)        5
(500, 332)        5
(431, 287)        5
(480, 320)        4
(260, 195)        4
(350, 263)        4
(2048, 1536)      4
(250, 187)        4
(625, 469)        4
(450, 600)        4
(226, 169)        4
               ... 
(425, 239)        1
(757, 502)        1
(240, 136)        1
(192, 300)        1
(150, 200)        1
(250, 170)        1
(500, 378)        1
(700, 467)        1
(383, 575)        1
(355, 168)        1
(238, 252)        1
(500, 290)        1
(257, 138)        1
(200, 267)        1
(300, 250)        1
(200, 149)        1
(354, 500)        1
(600, 452)        1
(320, 241)        1
