# The goal of this notebook is to make BirdSnap fit in 10GB

In [1]:
from datasets import load_dataset

In [5]:
import sys
print( sys.version )


3.10.15 (main, Sep  7 2024, 18:35:33) [GCC 9.4.0]


In [14]:
import os

_DATASET_FOLDR = "data"
_DATASET_INAME = "birdsnapLite.hf"
_DATASET_LNAME = "birdsnapLite_Final"
_DATASET_IPATH = os.path.join( _DATASET_FOLDR, _DATASET_INAME )
_DATASET_LPATH = os.path.join( _DATASET_FOLDR, _DATASET_LNAME )

In [None]:
# ds = load_dataset( "sasha/birdsnap" )
ds = load_dataset( "isaacchung/birdsnap" )
print( type( ds ) )
print( sys.getsizeof( ds ) )

In [None]:
ds.save_to_disk( _DATASET_IPATH )

In [10]:
import json
from datasets import Dataset, load_from_disk
import torchvision.transforms as T

_IMG_SIZ = (64, 64)


def crop_and_rescale( img_tensor, bbox, size = _IMG_SIZ ):
    """ Crop the region inside the bounding box and resize it to the given size. """
    # Extract the bounding box coordinates
    x_min, y_min, x_max, y_max = bbox

    # Crop the image tensor to the bounding box region
    cropped_img = img_tensor[:, y_min:y_max, x_min:x_max]  # Cropping the image
    
    # Resize the cropped image to the target size (64x64)
    resize_transform = T.Resize( size )
    resized_img = resize_transform( cropped_img )
    
    return resized_img
    

class BSL_Maker( Dataset ):
    """ Load the BirdSnap Dataset in order to reduce it """
    
    def __init__( self, train = True ):
        """ Load the local dataset to be reduced """
        trnInfo = None
        tstInfo = None
        valInfo = None
        with open( f"{_DATASET_IPATH}/train/dataset_info.json" ) as json_data:
            trnInfo = json.load( json_data )
        with open( f"{_DATASET_IPATH}/test/dataset_info.json" ) as json_data:
            tstInfo = json.load( json_data )
        with open( f"{_DATASET_IPATH}/val/dataset_info.json" ) as json_data:
            valInfo = json.load( json_data )
        
        self.dataset   = load_from_disk( _DATASET_IPATH )
        self.transform = T.ToTensor()
        self.labels    = {
            'train' : { 'common' : trnInfo['features']['common']['names'], 'scientific' : trnInfo['features']['scientific']['names'] },
            'test'  : { 'common' : tstInfo['features']['common']['names'], 'scientific' : tstInfo['features']['scientific']['names'] },
            'val'   : { 'common' : valInfo['features']['common']['names'], 'scientific' : valInfo['features']['scientific']['names'] },
        }

    
    def len_split( self, splitName ):
        """ Return the number of examples """
        return self.dataset.num_rows[ splitName ]


    def get_split_item( self, i, splitName = "train" ):
        """ Fetch an example from the dataset """
        img = self.dataset[ splitName ][i]["image"]
        img = self.transform( img )
        
        bb  = [ self.dataset[ splitName ][i]["bb_x1"], self.dataset[ splitName ][i]["bb_y1"], 
                self.dataset[ splitName ][i]["bb_x2"], self.dataset[ splitName ][i]["bb_y2"], ]
        img = crop_and_rescale( img, bb, size = _IMG_SIZ )
        
        return {
            "image"         : img.detach().clone(), 
            'common'        : self.labels[ splitName ]["common"][ self.dataset[ splitName ][i]["common"] ], 
            'scientific'    : self.labels[ splitName ]["scientific"][ self.dataset[ splitName ][i]["scientific"] ], 
            'common_idx'    : self.dataset[ splitName ][i]["common"], 
            'scientific_idx': self.dataset[ splitName ][i]["scientific"], 
        }


In [19]:
bsl = BSL_Maker()
print( bsl.len_split( "train" ) )
print( bsl.len_split( "test" ) )
print( bsl.len_split( "val" ) )

Loading dataset from disk:   0%|          | 0/133 [00:00<?, ?it/s]

38098
1851
7


In [20]:
import sys, time
now = time.time

_SKIP_DIV =  1
_SHOW_DIV = 50

data = {}
elemNames = ["image", 'common', 'scientific', 'common_idx', 'scientific_idx']

bgn = now()
for partName in ["train", "test", "val"]:
    print( partName, end = '', flush = True )
    N = bsl.len_split( partName )
    data[ partName ] = {
        "image"         : list(), 
        'common'        : list(), 
        'scientific'    : list(), 
        'common_idx'    : list(), 
        'scientific_idx': list(), 
    }
    for i in range(N):
        if ((i % _SKIP_DIV) == 0):
            try: 
                item = bsl.get_split_item( i, partName )
                for elemNam in elemNames:
                    data[ partName ][ elemNam ].append( item[ elemNam ] )
            except Exception as e:
                print( f"\nSkipped item {i} because {e}" )
        if ((i % _SHOW_DIV) == 0):
            print( '.', end = '', flush = True )
    print()
end = now()
print( f"Reduction process took {(end-bgn)/60.0/60.0} hours!" )

train....................................................................................................................................................................................
Skipped item 8964 because image file is truncated (45 bytes not processed)
.........................................................................................................................................................
Skipped item 16648 because Input and output sizes should be greater than 0, but got input (H: 216, W: 0) output (H: 64, W: 64)
....................................................................................................................................................



.........................................................................................................................................................................................................................................................................................
test......................................
val.
Reduction process took 3.8842570026053322 hours!


In [21]:
train_dataset = Dataset.from_dict( data["train"] )
test_dataset  = Dataset.from_dict( data["test"] )
val_dataset   = Dataset.from_dict( data["val"] )

In [22]:
from datasets import DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test' : test_dataset ,
    'val'  : val_dataset  ,
})

In [23]:
dataset_dict.save_to_disk( _DATASET_LPATH )

Saving the dataset (0/4 shards):   0%|          | 0/38096 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1851 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7 [00:00<?, ? examples/s]

In [26]:
# term$> huggingface-cli login
# Make sure that the token has WRITE permissions!
dataset_dict.push_to_hub( "jwatson-CO-edu/birdsnap_lite" )

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/223 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jwatson-CO-edu/birdsnap_lite/commit/9bcf097b2dd9ab1af578cf6d927446baa83888b3', commit_message='Upload dataset', commit_description='', oid='9bcf097b2dd9ab1af578cf6d927446baa83888b3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/jwatson-CO-edu/birdsnap_lite', endpoint='https://huggingface.co', repo_type='dataset', repo_id='jwatson-CO-edu/birdsnap_lite'), pr_revision=None, pr_num=None)