# Paquetes

In [1]:
import os
import pathlib

import tensorflow as tf
import tensorflow_hub as tfhub

import time
import split_folders #pip install split-folders -- https://github.com/jfilter/split-folders

In [2]:
#tf.enable_eager_execution()

# Variables

In [24]:
# Modelo importado
TFHUB_CACHE_DIR = './models/tfhub'
#os.environ['TFHUB_CACHE_DIR'] = TFHUB_CACHE_DIR
#os.makedirs(TFHUB_CACHE_DIR, exist_ok=True)

# Ruta donde estan los archivos
data_path = pathlib.Path('/Users/hdla/Documents/Proyectos/ML/MILE_data/imagesclassified')
train_path = pathlib.Path('/Users/hdla/Downloads/data/train')
test_path = pathlib.Path('/Users/hdla/Downloads/data/test')
val_path = pathlib.Path('/Users/hdla/Downloads/data/val')

# Divide la informacion de la carpeta dev en train-test-val
split_folders.ratio(data_path, output='/Users/hdla/Downloads/data/', seed=1337, ratio=(.8, .1, .1))

# Datos

Organizacion de features and labels. Las facturas se encuentran clasificadas en carpetas por cada nit de proveedor

In [25]:
def list_feature_label(path):
    # Ruta de las facturas
    facturas_root = [str(_) for _ in list(path.glob('*/*'))]

    # maestro proveedores(nit)
    proveedores = [_.name for _ in path.glob('*/')]

    # maestro Proveedores(nit) con indice
    proveedores_index = dict((_, idx) for idx,_ in enumerate(proveedores))

    # Listado del indice en orden de cada factura
    facturas_label = [proveedores_index[pathlib.Path(_).parent.name] for _ in facturas_root]
    
    return facturas_root, facturas_label

features_train, labels_train = list_feature_label(train_path)
features_test, labels_test = list_feature_label(test_path)
features_val, labels_val = list_feature_label(train_path)
features_main = [_.name for _ in data_path.glob('*/')]

In [17]:
#plt.display(plt.Image(facturas[369]))

Importacion del modulo de transfer learning para imagenes

In [18]:
def transfer_model():
    if not os.path.isdir(TFHUB_CACHE_DIR):
        tfh_module = tfhub.Module("https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2")
        return tfh_module
      #tfh_module = hub.Module("https://tfhub.dev/google/imagenet/mobilenet_v2_050_192/feature_vector/2")
      #tfh_module = hub.Module("https://tfhub.dev/google/inaturalist/inception_v3/feature_vector/1")
      #tfh_module = tfhub.Module("https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2")
    else:
        tfh_module = tfhub.Module(os.path.join(TFHUB_CACHE_DIR, 'adfe0cf8d843e3588bfb9602e32a718b12212904'))
        return tfh_module

IMAGE_SIZE = tfhub.get_expected_image_size(transfer_model())

Preprocesamiento de las imagenes

In [19]:
def load_and_preprocess_image(path):
    image = tf.read_file(path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)
    image /= 255.0
    return image

# Borrar cuando no es eager execute
#ima, la = load_and_preprocess_image(facturas_root[98], facturas_label[98])
#plt.imshow(ima)

Funciones de data ingest

In [20]:
# Ingest data
def inputfn_train():
    image = tf.data.Dataset.from_tensor_slices(features_train)
    image = image.map(load_and_preprocess_image)
    
    label = tf.data.Dataset.from_tensor_slices(tf.cast(labels_train, tf.int64))
    
    ds_image_label = tf.data.Dataset.zip((image, label))    
    ds_image_label = ds_image_label.shuffle(buffer_size=len(features_train))
    ds_image_label = ds_image_label.repeat(None) # Repeticion infinita, son los epochs
    ds_image_label = ds_image_label.batch(32)
    ds_image_label = ds_image_label.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return ds_image_label

def inputfn_eval():
    image = tf.data.Dataset.from_tensor_slices(features_val)
    image = image.map(load_and_preprocess_image)
    
    label = tf.data.Dataset.from_tensor_slices(tf.cast(labels_val, tf.int64))
    
    ds_image_label = tf.data.Dataset.zip((image, label))    
    ds_image_label = ds_image_label.repeat(1) # Repeticion infinita, son los epochs
    ds_image_label = ds_image_label.batch(32)
    ds_image_label = ds_image_label.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return ds_image_label

def inputfn_pred():
    pass

def inputfn_serving():
    pass
    

# Modelo

In [21]:
def model_fn(features, labels, mode):
    # Transfer learning
    tfh_module = transfer_model()

    transformed_features = tfh_module(features)
    logits = tf.layers.dense(transformed_features, len(features_main)) 
    probabilities = tf.nn.softmax(logits)
        
    if (mode != tf.estimator.ModeKeys.PREDICT):
        one_hot_labels = tf.one_hot(labels, len(features_main)) #tf.one_hot(len(proveedores))
        loss = tf.losses.softmax_cross_entropy(one_hot_labels, logits)
        optimizer = tf.train.AdamOptimizer()

        train_op = tf.contrib.training.create_train_op(loss, optimizer)
        accuracy = tf.metrics.accuracy(labels, tf.argmax(probabilities, axis=-1))
        metrics = {'acc': accuracy}
    else:
        loss = optimizer = train_op = metrics = None
    
    model = tf.estimator.EstimatorSpec(mode=mode,
                                       loss=loss,
                                       train_op=train_op,
                                       predictions={'proba': probabilities, 'class': tf.argmax(probabilities, axis=-1)},
                                       eval_metric_ops=metrics)
    
    return model

# Entrenamiento

In [23]:
run_config = tf.estimator.RunConfig(model_dir='./models/trained',
                                   save_summary_steps=10,
                                   save_checkpoints_steps=10,
                                   log_step_count_steps=10)

model = tf.estimator.Estimator(model_fn=model_fn,
                              config=run_config)

train_spec = tf.estimator.TrainSpec(input_fn=inputfn_train,
                                   max_steps=150)

eval_spec = tf.estimator.EvalSpec(input_fn=inputfn_eval)

start = time.time()
out = tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
print(out)
end = time.time()
print(end - start)

W0707 23:09:40.082838 4522362304 deprecation.py:323] From /Users/hdla/Documents/Proyectos/ML/venv/MILE/lib/python3.7/site-packages/tensorflow/python/training/saver.py:960: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.


({'acc': 0.9628099, 'loss': 0.17549129, 'global_step': 150}, [])
393.11907601356506


In [16]:
tensorboard --logdir './models/trained'

SyntaxError: invalid syntax (<ipython-input-16-0f863f0995a7>, line 1)