# Preparing the dataset.
1) Extraction of the various files


2) Converting the data into a format readable by Detectron2


3) Splitting the data up into training and validation.

## Imports and linking to Google Drive

In [1]:
# Install the notebook's dependencies: a pinned PyYAML first, then detectron2 from source.
!python -m pip install pyyaml==5.1
# distutils.core is only referenced by the commented-out "fast install" alternative below.
import sys, os, distutils.core
# Note: This is a faster way to install detectron2 in Colab, but it does not include all functionalities.
# See https://detectron2.readthedocs.io/tutorials/install.html for full installation instructions
#!git clone 'https://github.com/facebookresearch/detectron2'
#dist = distutils.core.run_setup("./detectron2/setup.py")
#!python -m pip install {' '.join([f"'{x}'" for x in dist.install_requires])}
#sys.path.insert(0, os.path.abspath('./detectron2'))

# Properly install detectron2. (Please do not install twice in both ways)
# NOTE(review): the URL carries no revision, so every run installs whatever the
# repository's default branch currently points at. The recorded run resolved to commit
# 38af375052d3ae7331141bc1a22cfa2713b02987; append "@<commit>" to the URL to reproduce it.
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/facebookresearch/detectron2.git
  Cloning https://github.com/facebookresearch/detectron2.git to /tmp/pip-req-build-ir6nf77m
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /tmp/pip-req-build-ir6nf77m
  Resolved https://github.com/facebookresearch/detectron2.git to commit 38af375052d3ae7331141bc1a22cfa2713b02987
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
import torch, detectron2
!nvcc --version
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_Mar__8_18:18:20_PST_2022
Cuda compilation tools, release 11.6, V11.6.124
Build cuda_11.6.r11.6/compiler.31057947_0
torch:  1.13 ; cuda:  cu116


In [3]:
import shutil
import os
import pandas as pd
import glob
import pickle
import xml.etree.ElementTree as ET

from detectron2.structures import BoxMode 

from sklearn.model_selection import train_test_split


In [4]:
#from google.colab import drive

#Mounting google drive to retrieve outcomes
#drive.mount('/content/drive/')
#Change to your own directory
#os.chdir('/content/drive/My Drive/Colab Notebooks/AppliedCV/')

In [5]:
!sudo apt-get install google-drive-ocamlfuse
!sudo add-apt-repository ppa:alessandro-strada/ppa
!sudo apt-get update
!sudo apt-get install google-drive-ocamlfuse

Reading package lists... Done
Building dependency tree       
Reading state information... Done
google-drive-ocamlfuse is already the newest version (0.7.30-0ubuntu1~ubuntu20.04.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-510
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.
 Mount Google Drive on Ubuntu (via FUSE)
 More info: https://launchpad.net/~alessandro-strada/+archive/ubuntu/ppa
Press [ENTER] to continue or Ctrl-c to cancel adding it.

Hit:1 http://archive.ubuntu.com/ubuntu focal InRelease
Get:2 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:5 http://ppa.launchpad.net/alessandro-strada/ppa/ubuntu focal InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_6

In [6]:
# Authorise google-drive-ocamlfuse in headless mode.
# Replace both placeholders with your own id and secret before running, and do not
# save the notebook with real credentials filled in.
!google-drive-ocamlfuse -headless -id=**Insert id here** -secret=**Insert secret here**

In [7]:
#Initializing google drive to a folder
if not os.path.exists("./acv"):
  os.makedirs("./acv")

!google-drive-ocamlfuse acv

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option


In [10]:
# Work from the project folder inside the mounted drive.
# NOTE(review): the path is relative, so this only resolves while the current
# directory is the one containing ./acv; re-running the cell after the directory
# has already been changed will not find the path again.
os.chdir('./acv/Colab Notebooks/AppliedCV/')

## Extraction of various files

In [11]:
# Dataset layout: every location below is derived from the SSDD root folder.
root = './Dataset/SSDD/'
xml_path = f"{root}Annotations_sub/"        # annotation XML files
jpg_test = f"{root}JPEGImages_sub_test/"    # source images of the test splits
jpg_train = f"{root}JPEGImages_sub_train/"  # source images of the training split
dest = f"{root}Proc_data/"                  # output tree written by this notebook
imagesets = f"{root}Images/"                # text files listing the images of each split

In [12]:
# Make sure the output tree exists: the processed-data root ("" suffix) followed by
# one folder per split. Folders that are already present are left alone.
split_folders = ("", "train/", "test/", "test_offshore/", "test_inshore/")
for split_folder in split_folders:
  target_dir = dest + split_folder
  if not os.path.exists(target_dir):
    os.makedirs(target_dir)

In [13]:
#Reading csv files into DataFrame.
# Each list becomes a single-column frame ('file'); the values are later used as
# file-name stems. dtype=str keeps them exactly as written: without it a purely
# numeric id would be parsed as an integer, dropping leading zeros and breaking the
# string concatenation that builds the image paths.
test, train, test_offshore, test_inshore = (
    pd.read_csv(f"{imagesets}{split}.txt", header = None, names = ['file'], dtype = str)
    for split in ('test', 'train', 'test_offshore', 'test_inshore')
)

In [14]:
#Function that copies files from given JPEG folders to the dataset folder
def copyFiles(df, orig = None, dest = None, overwrite = True):
  """Copy the '<file>.jpg' image of every row of ``df`` from ``orig`` to ``dest``.

  Args:
    df: DataFrame with a 'file' column holding image names without extension.
    orig: Source folder prefix. It is concatenated as-is with the file name, so it
      must end with a path separator.
    dest: Destination folder prefix, same convention as ``orig``.
    overwrite: When False, images that already exist in ``dest`` are skipped, which
      lets an interrupted run be resumed. Defaults to True (always copy).

  Raises:
    TypeError: If ``orig`` or ``dest`` is not provided.
  """
  # The None defaults exist only for signature compatibility; fail with a clear
  # message instead of an opaque "NoneType + str" error inside the loop.
  if orig is None or dest is None:
    raise TypeError("copyFiles() requires both 'orig' and 'dest' folder paths")

  copied = 0
  for seen, name in enumerate(df['file'], start = 1):
    # Lightweight progress indicator for long copies.
    if seen % 500 == 0:
      print (seen)
    target = dest + name + '.jpg'
    if not overwrite and os.path.exists(target):
      continue
    shutil.copyfile(orig + name + '.jpg', target)
    copied += 1
  # Files are copied, not moved: the sources stay in place.
  print("Copied " + str(copied) + " files.")

In [15]:
#Sorting out files by copying them to the correct dataset path
#The copy of the combined 'test' split is disabled:
#copyFiles(test, jpg_test, dest + 'test/')
for split_df, source_dir, split_dir in (
    (train, jpg_train, 'train/'),
    (test_offshore, jpg_test, 'test_offshore/'),
    (test_inshore, jpg_test, 'test_inshore/'),
):
  copyFiles(split_df, source_dir, dest + split_dir)

KeyboardInterrupt: ignored

In [None]:
#Function that extracts ground truth information from xml file 
def extract_xml(path):
  """Collect one row per annotated object from the XML files matching ``path``.

  Args:
    path: Glob pattern selecting the annotation files, e.g. 'folder/*.xml'.

  Returns:
    A pandas DataFrame with the columns filename, width, height, class, xmin, ymin,
    xmax, ymax. Files are visited in sorted path order so the row order is
    reproducible; files without any <object> element contribute no rows.
  """
  column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
  xml_list = []
  # glob returns paths in arbitrary order; sorting makes the output deterministic.
  for xml_file in sorted(glob.glob(path)):
    xml_root = ET.parse(xml_file).getroot()
    members = xml_root.findall('object')
    if not members:
      continue
    # Per-file fields are read once instead of once per object.
    filename = xml_root.find('filename').text
    size = xml_root.find('size')
    width = int(size[0].text)
    height = int(size[1].text)
    for member in members:
      # Children are addressed by position: child 0 is the class label and child 4
      # holds the four box coordinates in xmin, ymin, xmax, ymax order
      # (presumably the Pascal VOC layout - verify against the annotation files).
      box = member[4]
      xml_list.append((filename,
                       width,
                       height,
                       member[0].text,
                       int(box[0].text),
                       int(box[1].text),
                       int(box[2].text),
                       int(box[3].text)))
  return pd.DataFrame(xml_list, columns=column_name)

In [None]:
# Parse every annotation file and store the resulting table as labels.csv in the
# dataset root (written without the index column).
labels_df = extract_xml(f"{xml_path}*.xml")
labels_df.to_csv(f"{root}labels.csv", index=None)

## Converting the data into a format readable by Detectron2

In [None]:
#Convert annotations into Detectron's format
def standardize(path, annotations, df, grouped, height = 800, width = 800):
  """Build one Detectron2-style record per image in ``df`` and cache the records.

  Args:
    path: Folder prefix (ending with a path separator) in which
      'standardDict.pkl' is written.
    annotations: Annotation file names. Not used: the previous implementation
      derived a frame from it, then merged ``df`` with itself and discarded the
      result. Kept so existing calls stay valid.
    df: DataFrame with a 'file' column (image name without extension) and a 'type'
      column (folder prefix where the image is stored).
    grouped: GroupBy of the labels table keyed by image file name; every group
      provides the columns 'xmin', 'ymin', 'xmax' and 'ymax'.
    height: Image height stored in every record (default 800).
    width: Image width stored in every record (default 800).

  Returns:
    dict mapping image name -> record with the keys file_name, height, width,
    image_id and annotations. Images without a matching label group keep an empty
    annotation list. The pickle written to ``path`` holds list(records.values()).
  """
  # First folder seen for each image name, found in a single pass. This replaces a
  # per-image boolean scan of the whole frame and keeps first-match semantics.
  folder_of = {}
  for image_id, folder in zip(df['file'], df['type']):
    folder_of.setdefault(image_id, folder)

  # Give default values
  std = {
      image_id: {
          "file_name": folder + image_id + '.jpg',
          "height": height,
          "width": width,
          "image_id": image_id,
          "annotations": []
      }
      for image_id, folder in folder_of.items()
  }

  # Overwrite files with correct annotations
  for filename in grouped.groups.keys():
    # Label groups are keyed by file name; records are keyed by the part before
    # the first dot.
    image_id = filename.split('.')[0]
    if image_id not in std:
      continue
    boxes = grouped.get_group(filename)
    std[image_id]['annotations'] = [
        {
            'bbox': [row['xmin'], row['ymin'], row['xmax'], row['ymax']],
            'bbox_mode': BoxMode.XYXY_ABS,
            'category_id': 0  # every box gets category 0
        }
        for _, row in boxes.iterrows()
    ]

  # Cache; the context manager closes the file even if pickling fails.
  with open(path + "standardDict.pkl", "wb") as f:
    pickle.dump(list(std.values()), f)

  return std

In [None]:
# Store, in a 'type' column of each split frame, the folder prefix of that split's
# images inside the processed-data tree.
for split_frame, split_folder in (
    (test, 'test/'),
    (train, 'train/'),
    (test_offshore, 'test_offshore/'),
    (test_inshore, 'test_inshore/'),
):
  split_frame['type'] = dest + split_folder

In [None]:
# Build and cache the standardized records of every split.
anns = os.listdir(xml_path)
grouped = labels_df.groupby('filename') # used later
test_std, train_std, test_o_std, test_i_std = [
    standardize(dest + split_folder, anns, split_frame, grouped)
    for split_folder, split_frame in (
        ('test/', test),
        ('train/', train),
        ('test_offshore/', test_offshore),
        ('test_inshore/', test_inshore),
    )
]

In [None]:
# Function that retrieves a standard dataset compatible with detectron2
def get_dict(type):
  """Load the cached records of one dataset split.

  ``type`` names the sub-folder of the processed-data directory (for example
  'train'); the content of its 'standardDict.pkl' is returned unchanged.
  """
  split_folder = dest + type + '/'
  with open(split_folder + "standardDict.pkl", "rb") as pickle_file:
    records = pickle.load(pickle_file)
  return records

## Splitting the data up into training and validation.

In [None]:
# Load the cached training records.
# NOTE(review): this rebinds `train`, which earlier held the image-list DataFrame, to
# whatever the pickle contains (the list written by standardize). Earlier cells that
# treat `train` as a DataFrame cannot be re-run after this one without re-reading it.
train = get_dict('train')

In [None]:
# Keep 80% of the training records for training and hold out the rest for
# validation; the fixed random_state makes the split repeatable.
train_data, val_data = train_test_split(train, train_size=0.8, random_state = 1)

In [None]:
# Persist the training / validation split, one folder and one pickle per part.
for split_name, split_records in (("train", train_data), ("val", val_data)):
  split_dir = dest + "train/" + split_name + "/"
  # exist_ok replaces the separate existence check and keeps the cell re-runnable.
  os.makedirs(split_dir, exist_ok=True)
  # The context manager closes the file even if pickling fails.
  with open(split_dir + "standardDict.pkl", "wb") as f:
    pickle.dump(split_records, f)

In [None]:
#Prepare toy dataset for quick training/testing
# Only the 1% remainder is kept. The 99% part is bound to a throwaway name on
# purpose: assigning it to `train_data` would replace the 80% training split in the
# kernel with one that also contains validation records.
# The toy records are sampled from the full training list, so they can also occur
# in the train or val split.
_toy_rest, toy_data = train_test_split(train, train_size=0.99, random_state = 1)

os.makedirs(dest + "train/toy/", exist_ok=True)

# The context manager closes the file even if pickling fails.
with open(dest + 'train/toy/' + "standardDict.pkl", "wb") as f:
  pickle.dump(toy_data, f)