# Import the libraries to use

In [1]:
#Import system manupulation libraries
import os
import shutil

#Import data manipulation libraries
import numpy as np
import pandas as pd

# Division of data
The data that will be used in the experiment was found in Kaggle (2019) in a package called "aerial-cactus-identification.zip". The data was extracted in the path "../data/raw" and it contained:
* Two zip folders called "train.zip" and "test.zip", both containing the images recollected by <cite >(E. López-Jiménez, et al., 2019)</cite>
* A csv file called "train.csv", which contains the list of images names with the class it corresponds to, from the "train.zip" package.
* A csv file called "sample_submission.csv", which the names of the images in "test.zip"
Both, "test.zip" and "sample_submission.csv" will be both ignored for this project.

For dividing the data, we will start by:
* Extracting the "train.zip" package.
* Reading the "train.csv" file in order to look at the data which will be use for the experiments:

In [2]:
images_df = pd.read_csv('../data/raw/train.csv')
images_df.head()

Unnamed: 0,id,has_cactus
0,0004be2cfeaba1c0361d39e2b000257b.jpg,1
1,000c8a36845c0208e833c79c1bffedd1.jpg,1
2,000d1e9a533f62e55c289303b072733d.jpg,1
3,0011485b40695e9138e92d0b3fb55128.jpg,1
4,0014d7a11e90b62848904c1418fc8cf2.jpg,1


The columns can be explained like this:
* id: Name of the image
* has_cactus: A flag that indicates either if the image contains a cactus or not, being 0 = it doesn't have a cactus and 1 = has a cactus

## Separation of datasets
The images of the train package will be divided in two main dataframes called "validation" and "experiment". This division is done in order to have a control group of classified images for validation against overfitting as well as, the group of images in which the the experiment will be run.

To facilitate the division of dataframes `separate_dataframes` function was created. The function will separate the provided dataset in two dataframes based on the `separation_percentage` and the resulting dataframes will be shuffled if indicated:

In [3]:
def separate_dataframes(dataframe, separation_percentage, shuffle=False):
    manual_seed = 1
    separation_marker = int(len(dataframe) * separation_percentage)
    
    if shuffle:
        dataframe = dataframe.sample(frac=1, random_state = manual_seed)
    
    first_dataframe = dataframe.iloc[0:separation_marker]
    second_dataframe = dataframe.iloc[separation_marker:len(dataframe)]
    
    return first_dataframe, second_dataframe

### Separation of validation and experimental images
For the division of the validation and the experimental images group, a 0.1 percentage will be used, so the division will be:
* The validation group will be of the 10% of the provided images.
* The experiment group will consist on the 90% if the provided images.
* The groups will be shuffled

In [4]:
validation_images_df, experiment_images_df = separate_dataframes(images_df, 0.0, shuffle=True)
validation_images_df.shape, experiment_images_df.shape

((0, 2), (17500, 2))

In [5]:
experiment_images_df.head()

Unnamed: 0,id,has_cactus
6670,60033b4a196a4bc6510333a352a4f915.jpg,0
15889,e7df86ab6661be24a052aeb0a953746d.jpg,0
14851,d842e05b4cf1ca8f2f97584a663d1090.jpg,0
6946,641ef647b27a4ee4ba179b6f25426ac6.jpg,1
14933,d97a92fca0f0dff8df07fb1b3b752f7b.jpg,1


In [6]:
validation_images_df.head()

Unnamed: 0,id,has_cactus


### Separation of training and test images
The experiment images will also be divided into training and test, for that a 0.2 percent will be used for the division, and the groups will consist on:
* Test group will consist on the 20% of the experiment images.
* Train group will consist on the 80% of the experiment images.
* Groups will not be shuffled.

In [7]:
test_df, train_df = separate_dataframes(experiment_images_df, 0.2)
test_df.shape, train_df.shape

((3500, 2), (14000, 2))

In [8]:
test_df.head()

Unnamed: 0,id,has_cactus
6670,60033b4a196a4bc6510333a352a4f915.jpg,0
15889,e7df86ab6661be24a052aeb0a953746d.jpg,0
14851,d842e05b4cf1ca8f2f97584a663d1090.jpg,0
6946,641ef647b27a4ee4ba179b6f25426ac6.jpg,1
14933,d97a92fca0f0dff8df07fb1b3b752f7b.jpg,1


In [9]:
train_df.head()

Unnamed: 0,id,has_cactus
8435,7923561977ce2e3df6b4516210e513e2.jpg,1
12861,ba6690fce78bb36f059f3f4f2bdab4b2.jpg,1
12795,b9446fd0e854e4dcf8c39de917737ea9.jpg,1
5476,4ec98191dfd59040d2fbb180b8530cac.jpg,1
17195,fb695b75766192610a28b45a647a7ab2.jpg,1


# Export images by category
To preserve the original set of images, the images will be transfered into the folder path "../data/interim".

The export process will be done based on the division of the dataset previously done. For that, the following paths were defined:

In [10]:
#Get relative directory
project_dir = os.path.dirname(os.getcwd())

#New paths for the images
original_path = os.path.join(project_dir,'data/raw/train/')
new_val_path = os.path.join(project_dir,'data/interim/validation')
new_test_path = os.path.join(project_dir,'data/interim/experiment/test/')
new_train_path = os.path.join(project_dir,'data/interim/experiment/train/')

For exporting the images, the dataset with the images names will be provided with the original and new path where they will be exported. If a flag is provided, the images:
* New folders will also be added with the names 'has_cactus' and 'no_cactus'. This is done in order to take advantage of a tool that will be use when the images are loaded.
* The images will be divided into these new folders depending on the value of the 'has_cactus' column in the dataset.

In [11]:
def export_images(dataset, original_path, new_path, flag=None):
    if flag is not None:
        category1_path = os.path.join(new_path, 'has_cactus')
        os.makedirs(category1_path)
        print(f'Dif: {category1_path} created.')
        category2_path = os.path.join(new_path, 'no_cactus')
        os.makedirs(category2_path)
        print(f'Dif: {category2_path} created.')
    for row in dataset.itertuples(index=False):
        if flag is None:
            shutil.copyfile(os.path.join(original_path, row[0]), os.path.join(new_path, row[0]))
        elif row[1] == flag:
            shutil.copyfile(os.path.join(original_path, row[0]), os.path.join(category1_path, row[0]))
        else:
            shutil.copyfile(os.path.join(original_path, row[0]), os.path.join(category2_path, row[0]))

## Validation Images Export
Exporting the image control group into the "validation" path

In [12]:
import time
start_time = time.time()

os.makedirs(new_val_path)
export_images(validation_images_df, original_path, new_val_path)

finish_time = time.time()
print(f'Validation export lasted: {finish_time-start_time} seconds.')

Validation export lasted: 0.003888845443725586 seconds.


## Experiment Images Export

### Training Images Export
Exporting the images to be used in training the model.

In [13]:
import time
start_time = time.time()

os.makedirs(new_train_path)
export_images(train_df, original_path, new_train_path, flag=1)

finish_time = time.time()
print(f'Validation export lasted: {finish_time-start_time} seconds.')

Dif: /home/jascrer/TFM/TFM_DL_CNN_Codigo_SeguraCampos/tfm_dl-cnn_codigo_seguracampos/data/interim/experiment/train/has_cactus created.
Dif: /home/jascrer/TFM/TFM_DL_CNN_Codigo_SeguraCampos/tfm_dl-cnn_codigo_seguracampos/data/interim/experiment/train/no_cactus created.
Validation export lasted: 93.21837615966797 seconds.


### Test Images Export
Exporting the images to be used in testing the model.

In [14]:
import time
start_time = time.time()

os.makedirs(new_test_path)
export_images(test_df, original_path, new_test_path, flag=1)

finish_time = time.time()
print(f'Validation export lasted: {finish_time-start_time} seconds.')

Dif: /home/jascrer/TFM/TFM_DL_CNN_Codigo_SeguraCampos/tfm_dl-cnn_codigo_seguracampos/data/interim/experiment/test/has_cactus created.
Dif: /home/jascrer/TFM/TFM_DL_CNN_Codigo_SeguraCampos/tfm_dl-cnn_codigo_seguracampos/data/interim/experiment/test/no_cactus created.
Validation export lasted: 23.75086784362793 seconds.


# References
* Aerial Cactus Identification. (2019, 8 march). Kaggle. from https://www.kaggle.com/c/aerial-cactus-identification/overview
* López-Jiménez, Efren; Vasquez-Gomez, Juan Irving; Sanchez-Acevedo, Miguel Angel; Herrera-Lozada, Juan Carlos; Uriarte-Arcia, Abril Valeria (2019); “Columnar Cactus Recognition in Aerial Images using a Deep Learning Approach”. Ecological Informatics. 52. 131-138.