In [46]:
import os
import glob
import numpy as np
import pandas as pd

In [21]:
# Source folders holding the raw images and their corresponding masks
image_folder, mask_folder = "../data/image", "../data/mask"

## 1. List image & mask files

### 1.1 list of image files

In [122]:
# Build one [file_name, date, animal, minute] record per image file.
data_image_list = []
for filename in sorted(os.listdir(image_folder)):
    # Only parse image files. The train/validate destination folders defined
    # later in this notebook live inside image_folder, so os.listdir() can
    # also return directory names (or other stray entries) that do not split
    # into the three parts expected below and would raise ValueError.
    if not filename.endswith(".jpg"):
        continue
    ## filename = "20171027_ANIMAL2_0min.jpg"
    date, animal, minute = filename.split("_") # ["20171027", "ANIMAL2", "0min.jpg"]
    animal = animal[6:] # convert "ANIMAL2" to "2"
    minute = minute.split(".")[0][:-3] # convert "0min.jpg" to "0"
    # Rebuild the extension-free name so it can be compared with the mask list
    name = date + "_ANIMAL" + animal + "_" + minute + "min"
    data_image_list.append([name, date, animal, minute])

In [123]:
# Tabulate the parsed image records, one row per image file
data_image = pd.DataFrame(
    data_image_list,
    columns=["file_name", "date", "animal", "minute"],
)

In [124]:
# Persist the image file listing for later reference
data_image.to_csv(path_or_buf="../result/total_image_file_list.csv")

### 1.2 list of mask files

In [125]:
# Build one [file_name, date, animal, minute] record per mask file.
data_mask_list = []

for filename in sorted(os.listdir(mask_folder)):
    # Only parse mask image files. The train/validate destination folders
    # defined later in this notebook live inside mask_folder, so os.listdir()
    # can also return directory names (or other stray entries) that do not
    # split into the four parts expected below and would raise ValueError.
    if not filename.endswith(".jpg"):
        continue
    # filename = "20171027_ANIMAL2_0min_ROI.jpg"
    date, animal, minute, _ = filename.split("_") # ["20171027", "ANIMAL2", "0min", "ROI.jpg"]

    animal = animal[6:] # convert "ANIMAL2" to "2"
    minute = minute.split(".")[0][:-3] # convert "0min" to "0"
    # Same naming scheme as the image list (no "_ROI" suffix, no extension)
    name = date + "_ANIMAL" + animal + "_" + minute + "min"

    data_mask_list.append([name, date, animal, minute])

In [126]:
# Tabulate the parsed mask records, one row per mask file
data_mask = pd.DataFrame(
    data_mask_list,
    columns=["file_name", "date", "animal", "minute"],
)

In [127]:
# Persist the mask file listing for later reference
data_mask.to_csv(path_or_buf="../result/total_mask_file_list.csv")

### 1.3 Check for inconsistencies between image and mask files

In [128]:
# Print the (rows, columns) of the image listing, then of the mask listing
for listing in (data_image, data_mask):
    print(listing.shape)

(2420, 4)
(2420, 4)


In [129]:
# Preview the first five rows of the image listing
data_image.head(n=5)

Unnamed: 0,file_name,date,animal,minute
0,20171027_ANIMAL2_0min,20171027,2,0
1,20171027_ANIMAL2_10min,20171027,2,10
2,20171027_ANIMAL2_11min,20171027,2,11
3,20171027_ANIMAL2_12min,20171027,2,12
4,20171027_ANIMAL2_13min,20171027,2,13


In [130]:
# Preview the first five rows of the mask listing
data_mask.head(n=5)

Unnamed: 0,file_name,date,animal,minute
0,20171027_ANIMAL2_0min,20171027,2,0
1,20171027_ANIMAL2_10min,20171027,2,10
2,20171027_ANIMAL2_11min,20171027,2,11
3,20171027_ANIMAL2_12min,20171027,2,12
4,20171027_ANIMAL2_13min,20171027,2,13


In [131]:
### files in image but not in mask
image_has_mask = data_image["file_name"].isin(data_mask["file_name"])
data_image.loc[~image_has_mask, "file_name"]

Series([], Name: file_name, dtype: object)

In [132]:
### files in mask but not in image
mask_has_image = data_mask["file_name"].isin(data_image["file_name"])
data_mask.loc[~mask_has_image, "file_name"]

Series([], Name: file_name, dtype: object)

### 1.4 Number of animals (166)

In [136]:
# One row per (date, animal) pair; the remaining columns hold per-group counts
animal_groups = data_image.groupby(by=["date", "animal"])
df = animal_groups.count()

# 2. train test split

total image/mask pair: 2420

from 166 animals

### 2.1 generate list of validation data

In [None]:
# Fixed random seed so the validation sample drawn below is reproducible
seed = 12345

In [171]:
### sample by animals as validation set
# df has one row per (date, animal) group, so sampling rows picks whole animals
df_sample = df.sample(
    frac=0.1,
    random_state=seed,
)

In [177]:
## number of validation files
validation_file_counts = df_sample.sum()
print(validation_file_counts)

file_name    213
minute       213
dtype: int64


In [174]:
# Persist the sampled validation animals so the split can be reloaded later
df_sample.to_csv(path_or_buf="../result/validation_data_list.csv")

In [60]:
# NOTE(review): the recorded output (17, 4) appears to come from an
# out-of-order run made after df_sample was re-read from CSV; at this point
# df_sample presumably still has only the two count columns shown by
# df_sample.sum() above — confirm with a Restart & Run All.
df_sample.shape

(17, 4)

### 2.2 move the files into folders
image/image_train, image/image_validate, mask/mask_train, mask/mask_validate

In [2]:
# Reload the saved validation list from disk
df_sample = pd.read_csv("../result/validation_data_list.csv")

In [10]:
# Order the validation animals chronologically
df_sample = df_sample.sort_values("date")

In [24]:
# Build the "<date>_ANIMAL<id>" keys identifying every validation animal
validation_dates = df_sample["date"].astype(str)
validation_animals = df_sample["animal"].astype(str)
sample_name = set(validation_dates + "_ANIMAL" + validation_animals)

In [25]:
# Display the "<date>_ANIMAL<id>" keys selected for validation
sample_name

{'20171103_ANIMAL6',
 '20180509_ANIMAL2',
 '20180514_ANIMAL5',
 '20180515_ANIMAL2',
 '20180801_ANIMAL2',
 '20180802_ANIMAL3',
 '20180807_ANIMAL5',
 '20180821_ANIMAL3',
 '20180910_ANIMAL5',
 '20180915_ANIMAL1',
 '20181114_ANIMAL10',
 '20181114_ANIMAL12',
 '20181114_ANIMAL6',
 '20181114_ANIMAL8',
 '20181127_ANIMAL2',
 '20181208_ANIMAL2',
 '20181213_ANIMAL7'}

In [41]:
# Destination folders: x_* hold images, y_* hold masks
x_train_folder, x_test_folder = "../data/image/image_train", "../data/image/image_validate"
y_train_folder, y_test_folder = "../data/mask/mask_train", "../data/mask/mask_validate"

### image files

In [55]:
# Move every image into the train or validation folder.
# os.rename() does not create missing parent directories, so make sure both
# destinations exist first.
for folder in (x_train_folder, x_test_folder):
    os.makedirs(folder, exist_ok=True)

for file in glob.glob(image_folder + "/" + "*.jpg"): ## return full path
    filename = os.path.basename(file)
    ## check if the file belongs to validation data
    # Compare the exact "<date>_ANIMAL<id>" key rather than testing for a
    # substring: with a substring test "20180915_ANIMAL1" would also match
    # "20180915_ANIMAL10_...", moving other animals into the validation set.
    animal_key = "_".join(filename.split("_")[:2])
    if animal_key in sample_name:
        destination_dir = x_test_folder + "/" + filename
    else:
        destination_dir = x_train_folder + "/" + filename
    os.rename(file, destination_dir)

### mask files

In [56]:
# Move every mask into the train or validation folder.
# os.rename() does not create missing parent directories, so make sure both
# destinations exist first.
for folder in (y_train_folder, y_test_folder):
    os.makedirs(folder, exist_ok=True)

for file in glob.glob(mask_folder + "/" + "*.jpg"): ## return full path
    filename = os.path.basename(file)
    ## check if the file belongs to validation data
    # Compare the exact "<date>_ANIMAL<id>" key rather than testing for a
    # substring: with a substring test "20180915_ANIMAL1" would also match
    # "20180915_ANIMAL10_...", moving other animals into the validation set.
    animal_key = "_".join(filename.split("_")[:2])
    if animal_key in sample_name:
        destination_dir = y_test_folder + "/" + filename
    else:
        destination_dir = y_train_folder + "/" + filename
    os.rename(file, destination_dir)

### 2.3 sanity check

#### file numbers

In [59]:
# Report how many entries ended up in each of the four split folders
split_folders = [
    ("Number of x_train data:", x_train_folder),
    ("Number of x_test data:", x_test_folder),
    ("Number of y_train data:", y_train_folder),
    ("Number of y_test data:", y_test_folder),
]
for label, folder in split_folders:
    print(label, len(os.listdir(folder)))

Number of x_train data: 2207
Number of x_test data: 213
Number of y_train data: 2207
Number of y_test data: 213
