In [1]:
import zipfile
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import shutil


In [3]:
# Locations used by the preprocessing, all relative to the current working directory.
root = Path('.')
img_path, mod_path = root.joinpath('images'), root.joinpath('others')
annot_path = root.joinpath('annotations.csv')

In [32]:
# Guard against re-running the preprocessing: stop as soon as any of the
# output locations (image directory, 'others' directory, annotations CSV)
# is found to exist already.
already_preprocessed = any(
    target.exists() for target in (img_path, mod_path, annot_path)
)
assert not already_preprocessed, (
    "\nPreprocess is already executed, please remove ./images/, ./others/ and \n"
    "./annotations.csv to proceed.\n"
)

In [33]:
# Create both output directories; mkdir() raises if a directory already exists.
for directory in (img_path, mod_path):
    directory.mkdir()

In [4]:
# Zip archives sitting directly in the root directory (no recursion).
archives = [entry for entry in root.iterdir() if entry.name.endswith('.zip')]
# Annotation table shipped alongside the archives.
table = pd.read_csv('./Pulse-shapes_annotated_CS_images.csv')
# Empty frame with the columns the per-sample annotations are expected to use.
annotations = pd.DataFrame(
    {column: [] for column in ('X', 'class_name', 'has_image', 'has_fluorence')}
)

In [7]:
# Distinct X values among the rows that have no missing value in any column.
table.dropna()['X'].unique()

array([    2,     3,     4, ..., 14288, 14289, 14290], shape=(6613,))

In [35]:
# Extract the contents of every archive into the image directory,
# showing an 80-column progress bar while doing so.
for archive in tqdm(archives, desc='Unzipping images... ', ncols=80):
    with zipfile.ZipFile(archive) as bundle:  # read mode is the default
        bundle.extractall(path=img_path)

# flatten the nested image directories
# for img_file in tqdm(list(img_path.rglob('*.jpg')), desc='Flatten the image directory', ncols=80):
    # img_file.rename(img_path / img_file.name)

# remove empty directories
# for path in filter(lambda x: x.is_dir(), img_path.rglob('*')):
    # shutil.rmtree(path)

Unzipping images... : 100%|█████████████████████| 38/38 [00:02<00:00, 13.96it/s]


In [48]:
# Every extracted file (directories excluded), found recursively under the
# image directory and stored as a forward-slash path string.
# as_posix() is used instead of str() so the entries are comparable with
# from_table, which is built with a literal 'images/' prefix; str() would
# produce backslash-separated paths on Windows and nothing would match.
from_dir = {
    entry.as_posix() for entry in img_path.rglob('*') if not entry.is_dir()
}
from_dir

{'images/UtoF_FLR_L 2020-07-13 10h16_Cropped_With_Scalebar_631.jpg',
 'images/spp/UtoF_FLR_L 2020-07-11 23h16_Cropped_With_Scalebar_1292.jpg',
 'images/Plagioselmis/UtoF_FLR_L 2020-07-09 23h20_Cropped_With_Scalebar_707.jpg',
 'images/UtoF_FLR_L 2020-07-15 12h16_Cropped_With_Scalebar_1016.jpg',
 'images/UtoF_FLR_L 2020-07-10 00h20_Cropped_With_Scalebar_787.jpg',
 'images/UtoF_FLR_L 2020-07-10 02h20_Cropped_With_Scalebar_286.jpg',
 'images/UtoF_FLR_L 2020-07-08 22h40_Cropped_With_Scalebar_1333.jpg',
 'images/T_amphioxeia/UtoF_FLR_L 2020-07-11 14h16_Cropped_With_Scalebar_1115.jpg',
 'images/Plagioselmis/UtoF_FLR_L 2020-07-12 04h16_Cropped_With_Scalebar_2256.jpg',
 'images/Plagioselmis/UtoF_FLR_L 2020-07-11 23h16_Cropped_With_Scalebar_153.jpg',
 'images/spp/UtoF_FLR_L 2020-07-11 19h16_Cropped_With_Scalebar_1439.jpg',
 'images/UtoF_FLR_L 2020-07-09 15h40_Cropped_With_Scalebar_291.jpg',
 'images/spp/D208_FLR_L 2020-08-01 06h08_Cropped_With_Scalebar_524.jpg',
 'images/UtoF_FLR_L 2020-07-31 23

In [53]:
# Image paths the annotation table expects to exist: each file_id wrapped
# with the 'images/' prefix and the '.jpg' extension.
from_table = {'images/' + file_id + '.jpg' for file_id in table.file_id}
from_table

{'images/UtoF_FLR_L 2020-07-13 10h16_Cropped_With_Scalebar_631.jpg',
 'images/Plagioselmis/UtoF_FLR_L 2020-07-09 23h20_Cropped_With_Scalebar_707.jpg',
 'images/spp/UtoF_FLR_L 2020-07-11 23h16_Cropped_With_Scalebar_1292.jpg',
 'images/UtoF_FLR_L 2020-07-15 12h16_Cropped_With_Scalebar_1016.jpg',
 'images/UtoF_FLR_L 2020-07-10 00h20_Cropped_With_Scalebar_787.jpg',
 'images/Plagioselmis/UtoF_FLR_L 2020-07-12 04h16_Cropped_With_Scalebar_2256.jpg',
 'images/UtoF_FLR_L 2020-07-10 02h20_Cropped_With_Scalebar_286.jpg',
 'images/T_amphioxeia/UtoF_FLR_L 2020-07-11 14h16_Cropped_With_Scalebar_1115.jpg',
 'images/UtoF_FLR_L 2020-07-08 22h40_Cropped_With_Scalebar_1333.jpg',
 'images/Plagioselmis/UtoF_FLR_L 2020-07-11 23h16_Cropped_With_Scalebar_153.jpg',
 'images/spp/UtoF_FLR_L 2020-07-11 19h16_Cropped_With_Scalebar_1439.jpg',
 'images/UtoF_FLR_L 2020-07-09 15h40_Cropped_With_Scalebar_291.jpg',
 'images/spp/D208_FLR_L 2020-08-01 06h08_Cropped_With_Scalebar_524.jpg',
 'images/UtoF_FLR_L 2020-07-31 23

In [54]:
# Number of paths listed in the table that have no matching extracted file.
len(from_table.difference(from_dir))

20

In [None]:

# preprocess using the table
# for x in tqdm(table.X.unique(), desc='Processing the table... ', ncols=80):
#     samples = table[table.X == x]
#     file_id = samples.file_id.iloc[0]
#     info = {'X': x, 'class_name': samples.classes.iloc[0],
#            'has_image': False, 'has_fluorence': False}
#     if (img_path / f'{file_id}.jpg').exists():
#         info['has_image'] = True
#         (img_path / f'{file_id}.jpg').rename(img_path / f'{x}.jpg')
#     other = samples.iloc[:, -5:]
#     if not other.iloc[0].isna().any():
#         info['has_fluorence'] = True
#         other.to_csv(mod_path / f'{x}.csv', index=False)
#
# annotations.to_csv('annotations.csv')






In [None]:

# preprocess using the table
# for x in tqdm(table.X.unique(), desc='Processing the table... ', ncols=80):
#     samples = table[table.X == x]
#     file_id = samples.file_id.iloc[0]
#     info = {'X': x, 'class_name': samples.classes.iloc[0],
#            'has_image': False, 'has_fluorence': False}
#     if (img_path / f'{file_id}.jpg').exists():
#         info['has_image'] = True
#         (img_path / f'{file_id}.jpg').rename(img_path / f'{x}.jpg')
#     other = samples.iloc[:, -5:]
#     if not other.iloc[0].isna().any():
#         info['has_fluorence'] = True
#         other.to_csv(mod_path / f'{x}.csv', index=False)
#
# annotations.to_csv('annotations.csv')




