In [1]:
import numpy as np
import pandas as pd
def unpickle(fn):
    """Load the ``x`` and ``y`` entries stored in a NumPy archive.

    Parameters
    ----------
    fn : str or path-like
        Path of the file to read with ``np.load``.

    Returns
    -------
    tuple
        ``(data['x'], data['y'])`` as read from the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError
        If the archive has no ``x`` or ``y`` entry.
    """
    # NOTE(review): allow_pickle=True lets a crafted file run arbitrary code
    # while loading; only use this on files from a trusted source.
    data = np.load(fn, allow_pickle=True)
    try:
        return data['x'], data['y']
    finally:
        # An .npz archive keeps a file handle open until it is closed; plain
        # arrays returned by np.load have no close() and are left alone.
        close = getattr(data, 'close', None)
        if close is not None:
            close()

import sys, pathlib, random
from PIL import Image
from matplotlib import cm

import shutil

# Make sure the shared data folder exists before anything is written into it.
pathlib.Path('data').mkdir(exist_ok=True)
# Output folders for the recycle dataset and its "small" variant.
datadir_recycle = pathlib.Path('data') / 'recycle'
datadir_recycle_small = pathlib.Path('data') / 'recycle-small'

In [2]:
def _save_image_set(images, targets, labels, choose_split, datadir, datadir_small):
    """Write every image of one portion of the dataset to disk as a JPEG.

    Parameters
    ----------
    images : sequence of arrays
        Image arrays, each passed to ``Image.fromarray``.
    targets : indexable
        ``targets[i][0]`` is the index into ``labels`` for ``images[i]``.
    labels : list of str
        Class folder names.
    choose_split : callable
        Called once per image; returns the split folder name
        ('train', 'val' or 'test').
    datadir, datadir_small : pathlib.Path
        Roots of the full and the reduced dataset.
    """
    for i, x in enumerate(images):
        dirn = choose_split()
        y = targets[i][0]
        # Resolve the label before logging so the progress line reports the
        # class of this image rather than the previous one.
        label = labels[y]
        if i % 100 == 0: print(i, y, label, dirn)

        img = Image.fromarray(x)
        img.save(datadir / dirn / label / f'{i}.jpg')
        # A random ~22% of the images are also written to the reduced dataset.
        if random.random() < 0.22:
            img.save(datadir_small / dirn / label / f'{i}.jpg')


def split_recycle_data(npz_filename):
    """Unpack the recycle ``.npz`` archive into per-split, per-class JPEG folders.

    Images from ``x_train`` go to 'train' or 'val' (a random ~20% to 'val'),
    images from ``x_test`` go to 'test'. Everything is written below
    ``datadir_recycle``; a random subset is additionally written below
    ``datadir_recycle_small``. If both folders already exist nothing is done;
    if only one exists it is deleted and both are rebuilt.

    Parameters
    ----------
    npz_filename : str or path-like
        Archive with the entries ``x_train``, ``y_train``, ``x_test``, ``y_test``.
    """
    #data from http://web.cecs.pdx.edu/~singh/rcyc-web/dataset.html
    datadir = datadir_recycle
    datadir_small = datadir_recycle_small

    # Check before touching the archive: when the output is already there the
    # archive is not needed (and may have been cleaned up already).
    if datadir.exists() and datadir_small.exists():
        print(f'{datadir} and {datadir_small} already exist, skipping download of recycle dataset')
        return

    # Only one of the two exists: start over so both are built from the same run.
    if datadir.exists(): shutil.rmtree(datadir)
    if datadir_small.exists(): shutil.rmtree(datadir_small)

    labels = ['boxes', 'glass_bottles', 'soda_cans', 'crushed_soda_cans', 'water_bottles']

    for root in (datadir, datadir_small):
        for dirn in ['train', 'val', 'test']:
            for label in labels:
                (root / dirn / label).mkdir(parents=True, exist_ok=True)

    # NOTE(review): allow_pickle=True lets a crafted archive run arbitrary code
    # while loading; the archive should come from a trusted source.
    with np.load(npz_filename, allow_pickle=True) as d:
        print(d.files)

        # Each d[...] lookup reads the whole array from the archive, so every
        # array is fetched exactly once here instead of once per image.
        print('splitting training set')
        _save_image_set(
            d['x_train'], d['y_train'], labels,
            lambda: 'train' if random.random() > 0.2 else 'val',
            datadir, datadir_small,
        )

        print('saving test set')
        _save_image_set(
            d['x_test'], d['y_test'], labels,
            lambda: 'test',
            datadir, datadir_small,
        )


In [3]:
import urllib.request
import tarfile

# Source archive and the local files it turns into.
url = 'http://web.cecs.pdx.edu/~singh/rcyc-web/recycle_data_shuffled.tar.gz'
tarballfn = pathlib.Path('data/recycle_data_shuffled.tar.gz')
npzfn = pathlib.Path('data/recycle_data_shuffled.npz')

if datadir_recycle.exists() and datadir_recycle_small.exists():
    print(f'{datadir_recycle} and {datadir_recycle_small} already exist, skipping download of recycle dataset')
else:
    if not npzfn.exists():
        if not tarballfn.exists():
            print(f'downloading {url}....')
            urllib.request.urlretrieve(url, tarballfn)
            print('done')
        else:
            print(f'{tarballfn} exists, skipping download')

        print(f'extracting data from {tarballfn}')
        # NOTE(review): the archive is fetched over plain HTTP and extracted
        # without member filtering; on Python versions that support it,
        # consider passing filter='data' to extractall.
        with tarfile.open(tarballfn, "r:gz") as tar:
            tar.extractall(path='data')
    else:
        print(f'{npzfn} exists, skipping tarball download')

    # Split whenever the output folders are missing, including the case where
    # the .npz was already on disk; otherwise the cleanup below would delete
    # the archive without the dataset ever having been produced.
    split_recycle_data(npzfn)

# The intermediate files are no longer needed once the folders exist.
if tarballfn.exists():
    print(f'cleaning up {tarballfn}')
    tarballfn.unlink()

if npzfn.exists():
    print(f'cleaning up {npzfn}')
    npzfn.unlink()


data/recycle and data/recycle-small already exist, skipping download of recycle dataset


In [4]:
!pip install opendatasets --upgrade --quiet
import opendatasets as od

In [5]:
# Folder inside the Kaggle download whose sub-directories are treated as classes.
origdir = pathlib.Path('./garbage-classification/Garbage classification/Garbage classification')
targetdir = pathlib.Path('data/garbage')

if targetdir.exists():
    print(f'{targetdir} exists, skipping download and prep of garbage dataset')
else:
    od.download('https://www.kaggle.com/asdasdasasdas/garbage-classification')

    targetdir.mkdir()

    for setn in ['train','test', 'val']:
       (targetdir / setn).mkdir()
    #20% for test
    #then 20% of the remaining for val

    for dirn in origdir.iterdir():
        # Skip stray files: only directories are class folders, and calling
        # iterdir() on a regular file would raise.
        if not dirn.is_dir():
            continue
        print(f'doing {dirn}')

        for setn in ['train','test', 'val']:
            (targetdir / setn / dirn.name).mkdir()
        for fn in dirn.iterdir():
            rnd = random.random()
            if rnd < 0.2:
                #move it to test
                setn = 'test'
            elif rnd < 0.36: # 20% of 80%
                #move it to val
                setn = 'val'
            else:
                #move it to train
                setn = 'train'
            fn.rename(targetdir / setn / dirn.name / fn.name)

    # Everything useful has been moved out; drop what is left of the download.
    shutil.rmtree('./garbage-classification')

        

data/garbage exists, skipping download and prep of garbage dataset


In [7]:
#now download the pre-trained models and histories
pathlib.Path('models').mkdir(exist_ok=True)
pathlib.Path('histories').mkdir(exist_ok=True)
for filest in ['inception-garbage-7', 'inception-recycle-7', 'inception-recycle-small-7', 'resnet-garbage-3', 'resnet-recycle-3', 'resnet-recycle-small-3']:
    for filep in [pathlib.Path(f'models/{filest}.model'), pathlib.Path(f'histories/{filest}.history')]:
        print(filep)

        # Files already on disk are kept as they are and not fetched again.
        if not filep.exists():
            # as_posix() keeps forward slashes in the URL; str(filep) would
            # insert backslashes on Windows and produce an invalid object key.
            urllib.request.urlretrieve(f'https://recycle-classifier-models.s3.eu-west-2.amazonaws.com/{filep.as_posix()}', filep)
   

models/inception-garbage-7.model
histories/inception-garbage-7.history
models/inception-recycle-7.model
histories/inception-recycle-7.history
models/inception-recycle-small-7.model
histories/inception-recycle-small-7.history
models/resnet-garbage-3.model
histories/resnet-garbage-3.history
models/resnet-recycle-3.model
histories/resnet-recycle-3.history
models/resnet-recycle-small-3.model
histories/resnet-recycle-small-3.history
