In [175]:
import torch
import torchvision
import pandas as pd
from skimage import io
import os
from torch.utils.data import Dataset

# set where you want to store the data here, we will make it for you. Do not include a last slash.
data_folder = '/home/ubuntu/brackish_data'
#if the data folder does not exist then make the folder
if not os.path.exists(data_folder):
    !mkdir $data_folder
    print('We made the folder for you:',data_folder)
else: 
    print('The folder already existed')

# base = '/home/ubuntu/Brackish/'
# annot = 'annotations/annotations_AAU/train.csv'

The folder already existed


#### Download Data
Kaggle download links do not work with wget. There is also no good api for downlopading through the commandline. So download the brackish dataset and place it in *data_folder*. You should have a file called *archive.zip*. You can upload it with the jupyter notebook client. 

https://www.kaggle.com/datasets/aalborguniversity/brackish-dataset

#### Unzip Data
Once downloaded, the next cell will unzip it for you.

In [178]:
#unzipping the file
unzipped = os.path.exists(data_folder+'/archive')
if not unzipped:
    !unzip -q $data_folder/archive.zip -d $data_folder/archive
    print('we have unzipped it for you')
else: print('The folder [archive] exists, so we think archive.zip is already unzipped')

The folder [archive] exists, so we think archive.zip is already unzipped


#### Extracting images from videos
The creators of the kaggle dataset have provided a script for extracting the images from the video files. It assigns each image the correct file name that is used in the provided annotations. 

In [150]:
# add the folder to system directory so we can access their script
# You may need to install some dependancies
#ffmpeg, conda install -c conda-forge ffmpeg
#ipdb, conda install -c conda-forge ipdb
import sys 
# adding Folder_2 to the system path
sys.path.insert(0, data_folder+'/archive/scripts')
from frameExtractor import extractFrames

In [173]:
# !rm -R $data_folder/images
![-d "$data_folder"]

/bin/bash: [-d: command not found


In [182]:
frames_extracted = os.path.exists(data_folder+'/images')
folders = ['crab','fish-big','fish-school','fish-small-shrimp','jellyfish']
if not frames_extracted:
    for fol in folders:
        extractFrames({'inputFolder':data_folder+'/archive/dataset/videos/'+fol, 'outputFolder':data_folder+'/images'});
    print('We have extracted all the frames for you and placed them in:', data_folder+'/images')
else:
    print('The folder',data_folder+'/images','already exists. So we assumed the images are already extracted.')

The folder /home/ubuntu/brackish_data/images already exists. So we assumed the images are already extracted.


#### Make final data folder
To make things simple we copy the necessary files for pytorch to one folder and leave the rest.

In [184]:
!mkdir ~/brackishData
!cp -R $data_folder/images ~/brackishData
!cp -R $data_folder/annotations/annotations_AUU ~/brackishData

mkdir: cannot create directory ‘/home/ubuntu/brackishData’: File exists
cp: cannot stat '/home/ubuntu/brackish_data/annotations/annotations_AUU': No such file or directory


resources used to make the dataset  
https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html  
https://www.youtube.com/watch?v=ZoZHd0Zm3RY  
https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html   
https://github.com/pytorch/vision.git

#### Making the class
The standard way to load data into pytorch is with a custom data class. I used the following resources to construct this class.  
  
https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html  
https://www.youtube.com/watch?v=ZoZHd0Zm3RY   
https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html  
https://github.com/pytorch/vision.git   

This will be in a brackishData.py file that will be in the git repo to be imported into your notebooks.

Some examples for how to use the class are given after it is made.

This in theory could be used in the place of standard pytorch data sets you will find in tutorials.

In [47]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from skimage import io

class brackish_data(Dataset):
    label_key = {'fish':0,'small_fish':1,'crab':2,'shrimp':3,'jellyfish':4,'starfish':5}
    def __init__(self, annotations_file, images_folder, transform=None):
        cnames = ['filename', 'object_id', 'label', 'up_left_x','up_left_y','low_right_x','low_right_y']
        self.annotations = pd.read_csv(annotations_file, delimiter=';',skiprows=1,names=cnames)
        
        self.filenames = self.annotations['filename'].drop_duplicates()
        self.images_folder = images_folder
        self.transform = transform
        
        ulx, uly, lrx, lry = self.annotations['up_left_x'], self.annotations['up_left_y'], self.annotations['low_right_x'], self.annotations['low_right_y']
        self.annotations['area'] = (lrx - ulx) * (lry - uly)
        
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self,index):
        filename = self.filenames.iloc[index]
        img_path = os.path.join(self.images_folder,filename)
        image = io.imread(img_path)
        
        annotations_filtered = self.annotations[self.annotations['filename']==filename]
        
        boxes = annotations_filtered[['up_left_x','up_left_y','low_right_x','low_right_y']]
        #might need to add torch.cuda later when cuda is enabled
        boxes = torch.FloatTensor(boxes.values.tolist())
        
        labels = annotations_filtered['label'].tolist()
        labels_int = [label_key[lbl] for lbl in labels]
        #might need to add torch.cuda later when cuda is enabled
        labels_int = torch.LongTensor(labels_int)
        
        area = torch.tensor(annotations_filtered['area'].tolist())
        
        target = {'boxes':boxes,'labels':labels_int,'image_id':filename,'area':area,'iscrowd':False}
        
        if self.transform:
            image = self.transform(image)
        
        return image, target

data_folder = '/home/ubuntu/brackishData'
train_data = brackish_data(annotations_file=data_folder+'/annotations_AUU/train.csv', images_folder=data_folder+'images/',transform = torchvision.transforms.ToTensor())
test_data = brackish_data(annotations_file=data_folder+'/annotations_AUU/test.csv', images_folder=data_folder+'images/',transform = torchvision.transforms.ToTensor())
valid_data = brackish_data(annotations_file=data_folder+'/annotations_AUU/valid.csv', images_folder=data_folder+'images/',transform = torchvision.transforms.ToTensor())
