In [None]:
#default_exp data_loading

In [None]:
# export
import pandas as pd
import os
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from sklearn import model_selection
import tqdm
import PIL
import numpy as np


In [None]:
%matplotlib inline

In [None]:
# export
def map_class_to_taxon(mapping, class_id, taxon='Family'):
    """Look up the taxon label stored for a class id.

    Args:
        mapping: pandas object indexed by class id whose columns are taxon
            levels (presumably the metadata table indexed by ``ClassId`` --
            confirm against callers).
        class_id: Index label of the row to read.
        taxon: Name of the column to return; defaults to ``'Family'``.

    Returns:
        The value found at row ``class_id`` and column ``taxon``.
    """
    class_row = mapping.loc[class_id]
    return class_row[taxon]


def parse_image(filename, image_size=224):
    """Read a JPEG file and return it as a square float32 image tensor.

    Args:
        filename: Path of the JPEG file, handed to ``tf.io.read_file``.
        image_size: Side length used for both dimensions of the resized
            output (default 224).

    Returns:
        The decoded image converted to ``tf.float32`` and resized to
        ``[image_size, image_size]``.
    """
    encoded_bytes = tf.io.read_file(filename)
    decoded_pixels = tf.image.decode_jpeg(encoded_bytes)
    float_pixels = tf.image.convert_image_dtype(decoded_pixels, tf.float32)
    output_shape = [image_size, image_size]
    return tf.image.resize(float_pixels, output_shape)

In [None]:
# export
# Default data directory, expressed relative to the working directory.
DATA_DIR = os.path.join('..', 'data')


def get_metadata(data_dir=DATA_DIR):
    """Load the PlantCLEF 2019 master training metadata table.

    Args:
        data_dir: Directory containing ``PlantCLEF2019MasterTraining.csv``;
            defaults to ``DATA_DIR``.

    Returns:
        pandas.DataFrame parsed from the semicolon-separated CSV file.
    """
    csv_name = 'PlantCLEF2019MasterTraining.csv'
    csv_location = os.path.join(data_dir, csv_name)
    metadata = pd.read_csv(csv_location, sep=';')
    return metadata
    

In [None]:
metadata_df = get_metadata()

In [None]:
metadata_df.info()

In [None]:
metadata_df.head()

In [None]:
# Rows per family, then the total number of rows that belong to families
# with more than 1000 rows.
family_counts = metadata_df['Family'].value_counts()
is_large_family = family_counts > 1000
family_counts[is_large_family].sum()

In [None]:
metadata_df['Family'].value_counts()

In [None]:
metadata_df['Genus'].value_counts().hist()

In [None]:
metadata_df['ClassId'].unique().shape

In [None]:
metadata_df['Species'].value_counts()[-2000:]

In [None]:
!ls data/images/data/101969

In [None]:
# export
# Base directory of the training images; prepare_classification_metadata joins
# the ClassId and MediaId of each row onto the base path it is given.
train_dir = os.path.join(DATA_DIR, 'images_train/data')

In [None]:
# export


def filter_invalid_images(raw_maybe_invalid_classification_metadata_df):
    """Drop rows whose image file cannot be used.

    An image is rejected when it cannot be opened and decoded, or when its
    decoded pixel array has fewer than three dimensions.

    Args:
        raw_maybe_invalid_classification_metadata_df: DataFrame with a
            ``'filename'`` column holding image paths.

    Returns:
        The rows of the input whose ``'filename'`` passed both checks.
    """
    # Import the submodule explicitly: ``import PIL`` alone does not guarantee
    # that ``PIL.Image`` is loaded. Doing it outside the ``try`` keeps a
    # missing dependency from being mistaken for a broken image.
    from PIL import Image

    failed_paths = []
    # Scan the frame that was passed in, not a notebook-level global.
    for path in tqdm.tqdm(raw_maybe_invalid_classification_metadata_df['filename']):
        try:
            # The context manager releases the file handle of every image.
            with Image.open(path) as img:
                is_usable = np.asarray(img).ndim >= 3
        except Exception:
            # Best effort: any failure to read or decode marks the file as
            # invalid. Catching ``Exception`` instead of using a bare
            # ``except`` lets KeyboardInterrupt stop this long-running scan.
            is_usable = False
        if not is_usable:
            failed_paths.append(path)
    is_invalid = raw_maybe_invalid_classification_metadata_df['filename'].isin(failed_paths)
    return raw_maybe_invalid_classification_metadata_df[~is_invalid]

def prepare_classification_metadata(metadata_df, target_col, basepath):
    """Build the per-image table used for classification.

    Args:
        metadata_df: DataFrame with ``'ClassId'`` and ``'MediaId'`` columns
            plus the column named by ``target_col``.
        target_col: Column of ``metadata_df`` copied into ``'class'``.
        basepath: Directory prefix of every generated image path.

    Returns:
        The result of ``filter_invalid_images`` applied to a DataFrame with
        the columns ``'class'``, ``'name'`` (the ``MediaId``) and
        ``'filename'`` (``basepath``, ``ClassId`` and ``MediaId`` joined as a
        path, with ``.jpg`` appended).
    """
    media_col = 'MediaId'

    def build_image_path(row):
        # Base path first, then every value of the row rendered as text.
        components = [basepath] + [str(value) for value in row]
        return os.path.join(*components) + '.jpg'

    path_parts = metadata_df[['ClassId', media_col]]
    classification_df = pd.DataFrame({
        'class': metadata_df[target_col],
        'name': metadata_df[media_col],
    })
    classification_df['filename'] = path_parts.agg(build_image_path, axis=1)
    return filter_invalid_images(classification_df)


In [None]:
%%time

raw_classification_metadata_df = prepare_classification_metadata(metadata_df, 'Family', train_dir)

## Number of classes

In [None]:
raw_classification_metadata_df['class'].unique().shape

## Number of classes with more than 1000 examples

In [None]:
classification_metadata_df = raw_classification_metadata_df

In [None]:
# No `get_classification_metadata_df` helper is defined in this notebook, so
# calling it fails on a fresh kernel. The table returned by
# `prepare_classification_metadata` is used as-is; only its size is shown.
raw_classification_metadata_df.shape

In [None]:
# Invalid images were already dropped inside `prepare_classification_metadata`
# (it returns `filter_invalid_images(...)`). `failed_paths` only exists as a
# local of that function, so it cannot be referenced at notebook scope.
classification_metadata_df = raw_classification_metadata_df

In [None]:
train_classification_metadata_df, test_classification_metadata_df = model_selection.train_test_split(classification_metadata_df, test_size=10000, random_state=0)

In [None]:
# Generator configured with a 1/255 rescale factor and horizontal flipping.
image_gen = keras.preprocessing.image.ImageDataGenerator(
    horizontal_flip=True,
    rescale=1./255,
)

# Iterator over the training table, producing 224x224 images.
sample_image_iterator = image_gen.flow_from_dataframe(
    train_classification_metadata_df,
    target_size=(224, 224),
)

# Number of entries in the iterator's class-to-index mapping.
n_classes = len(sample_image_iterator.class_indices)

In [None]:
batch = next(sample_image_iterator)

In [None]:
plt.imshow(batch[0][1]);

### This part will be used in script mode

In [None]:
#export
# Output locations of the train/test metadata tables written in script mode.
train_csv_path = os.path.join(DATA_DIR, 'train_metadata.csv')
test_csv_path = os.path.join(DATA_DIR, 'test_metadata.csv')
test_size = 10000  # number of rows held out for the test split
target_class = 'Family'  # metadata column used as the classification label


if __name__ == '__main__':
    # Refuse to overwrite existing outputs before starting the slow scan of
    # every image file, instead of after it.
    if os.path.exists(train_csv_path) or os.path.exists(test_csv_path):
        raise ValueError("some metadata file already exists, exiting")

    metadata_df = get_metadata()
    # prepare_classification_metadata already removes invalid images, and no
    # further `get_classification_metadata_df` step is defined in this module,
    # so its result is the final classification table.
    raw_classification_metadata_df = prepare_classification_metadata(metadata_df, target_class, train_dir)
    classification_metadata_df = raw_classification_metadata_df
    # Fixed random_state keeps the split reproducible between runs.
    train_classification_metadata_df, test_classification_metadata_df = model_selection.train_test_split(
        classification_metadata_df, test_size=test_size, random_state=0)
    train_classification_metadata_df.to_csv(train_csv_path)
    test_classification_metadata_df.to_csv(test_csv_path)