# Análisis de datos

Este notebook convierte los datos descargados en el notebook 0_preprocessing.ipynb en archivos TFRecord para usarlos con TensorFlow.

# Prerrequisitos

Es necesario ejecutar el notebook 0_preprocessing.ipynb para crear los ejemplos en formato npy antes de ejecutar este notebook.

# Librerías

In [1]:
import os
import random

import numpy as np

os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
import json
import pprint
import tensorflow as tf
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from tqdm import tqdm




# Constantes

In [2]:
# Folder holding the standardized, cleaned examples (one file per example).
# NOTE(review): this path climbs two directory levels while TF_DATASET_FOLDER
# climbs only one — confirm that the difference is intended.
CLEANED_DATASET_FOLDER = "../../cleaned_data"
# Folder where the TFRecord files are written and later read back.
TF_DATASET_FOLDER = "../tf_data"
# Number of example files grouped into each chunk (one TFRecord file per chunk).
CHUNK_SIZE = 50

# Funciones

## Generación de tfrecords

In [3]:
def image_feature(value):
    """Build a bytes ``tf.train.Feature`` holding ``value`` encoded as JPEG.

    Args:
        value: Image data handed straight to ``tf.io.encode_jpeg``.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` has a single entry: the
        result of calling ``.numpy()`` on the encoded image.
    """
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    payload = tf.train.BytesList(value=[jpeg_bytes])
    return tf.train.Feature(bytes_list=payload)

def bytes_feature(value):
    """Build a bytes ``tf.train.Feature`` from a string or a bytes object.

    Args:
        value: A ``str`` (encoded with the default ``str.encode()`` codec,
            UTF-8) or an object that is already ``bytes``.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` has a single entry.
    """
    # Only text needs encoding; calling ``.encode()`` on ``bytes`` would raise
    # AttributeError, so bytes are passed through untouched.
    if isinstance(value, str):
        value = value.encode()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def float_feature(value):
    """Wrap one float in a ``tf.train.Feature``.

    Args:
        value: The scalar to store.

    Returns:
        A ``tf.train.Feature`` whose ``float_list`` contains only ``value``.
    """
    single_float = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=single_float)


def int64_feature(value):
    """Wrap one integer-like value in a ``tf.train.Feature``.

    Args:
        value: The scalar to store (bool / enum / int / uint).

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` contains only ``value``.
    """
    single_int = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=single_int)


def float_feature_list(value):
    """Wrap a whole collection of floats in a single ``tf.train.Feature``.

    Unlike ``float_feature``, ``value`` is not wrapped in a list: it is passed
    to ``tf.train.FloatList`` as-is, so it must already be a collection.

    Args:
        value: The floats to store.

    Returns:
        A ``tf.train.Feature`` whose ``float_list`` holds every item of
        ``value``.
    """
    all_floats = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=all_floats)

In [4]:
def create_tf_example(example):
    """Convert one example mapping into a ``tf.train.Example``.

    Args:
        example: Mapping with the keys ``"song_id"``, ``"genre_id"`` and
            ``"time_series"``. The first two are stored as int64 features and
            the last one as a float list (presumably the audio samples —
            TODO confirm against the preprocessing notebook).

    Returns:
        A ``tf.train.Example`` with those three features.
    """
    # Scalar integer fields first, in the same order they are serialized.
    feature_map = {
        key: int64_feature(example[key]) for key in ("song_id", "genre_id")
    }
    feature_map["time_series"] = float_feature_list(example["time_series"])
    wrapped = tf.train.Features(feature=feature_map)
    return tf.train.Example(features=wrapped)

## Iteración sobre npy a tfrecord

In [5]:
def list_npy_files(seed=None):
    """List the ``.npy`` files in ``CLEANED_DATASET_FOLDER`` in shuffled order.

    Only regular files whose name ends in ``.npy`` are returned, so stray
    entries (sub-folders, checkpoints, notes) never reach ``np.load``.

    Args:
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` state is used, as before.

    Returns:
        A list of paths, each joined with ``CLEANED_DATASET_FOLDER``.
    """
    candidates = (
        os.path.join(CLEANED_DATASET_FOLDER, file_name)
        for file_name in os.listdir(CLEANED_DATASET_FOLDER)
        if file_name.lower().endswith(".npy")
    )
    # os.listdir order is arbitrary; sorting first makes a seeded shuffle
    # produce the same result on every machine.
    npy_files = sorted(path for path in candidates if os.path.isfile(path))
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle
    shuffle(npy_files)
    return npy_files

In [6]:
def divide_chunks(files, chunk_size):
    """Split ``files`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        files: Sliceable sequence to partition.
        chunk_size: Maximum number of items per chunk.

    Returns:
        A dict mapping ``"chunk_<start>"`` to each slice. ``<start>`` is the
        offset of the slice's first item in ``files`` (0, chunk_size,
        2 * chunk_size, ...), not a sequential chunk counter.
    """
    chunks = {}
    for start in range(0, len(files), chunk_size):
        stop = start + chunk_size
        chunks[f"chunk_{start}"] = files[start:stop]
    return chunks

In [7]:
def chunk_to_tf_record(chunk_id, chunk_files):
    """Serialize every file of one chunk into a single TFRecord file.

    The output is written to ``<TF_DATASET_FOLDER>/<chunk_id>.tfrecord``.

    Args:
        chunk_id: Name used for the output file (without extension).
        chunk_files: Paths of the ``.npy`` files that belong to this chunk.
    """
    record_path = os.path.join(TF_DATASET_FOLDER, f"{chunk_id}.tfrecord")
    with tf.io.TFRecordWriter(record_path) as writer:
        for npy_path in chunk_files:
            # NOTE(review): allow_pickle=True unpickles the file contents, which
            # can execute arbitrary code — only load files this project created.
            # ``.tolist()`` unwraps the loaded array into a plain Python object.
            loaded = np.load(npy_path, allow_pickle=True).tolist()
            serialized = create_tf_example(loaded).SerializeToString()
            writer.write(serialized)

In [8]:
def npy_dataset_to_tfrecords():
    """Convert the whole ``.npy`` dataset into TFRecord files, one per chunk.

    Lists the example files, groups them into chunks of ``CHUNK_SIZE`` and
    writes each chunk to ``TF_DATASET_FOLDER`` in parallel with joblib.
    """
    # makedirs(exist_ok=True) also creates missing parent folders and avoids
    # the check-then-create race of ``os.path.exists`` followed by ``os.mkdir``.
    os.makedirs(TF_DATASET_FOLDER, exist_ok=True)
    npy_files = list_npy_files()
    chunks = divide_chunks(npy_files, CHUNK_SIZE)
    # n_jobs=-1 uses every available core; the workers return nothing useful,
    # so the result list is discarded.
    Parallel(n_jobs=-1, verbose=10)(
        delayed(chunk_to_tf_record)(chunk_id, chunk_files)
        for chunk_id, chunk_files in chunks.items()
    )

## Lectura de dataset

In [9]:
def load_dataset(tfrecord_folder):
    """Load every ``.tfrecord`` file in a folder as one parsed dataset.

    Args:
        tfrecord_folder: Folder containing the TFRecord files.

    Returns:
        A ``tf.data`` dataset whose elements are the dicts produced by
        ``parse_tfrecord_fn``.
    """
    # Keep only TFRecord files (anything else in the folder would fail while
    # being read) and sort them so the file order does not depend on the
    # arbitrary order returned by os.listdir.
    tf_record_files = sorted(
        os.path.join(tfrecord_folder, file_name)
        for file_name in os.listdir(tfrecord_folder)
        if file_name.endswith(".tfrecord")
    )
    return tf.data.TFRecordDataset(tf_record_files).map(parse_tfrecord_fn)

In [10]:
def parse_tfrecord_fn(tf_example):
    """Decode one serialized example into a dict of tensors.

    Args:
        tf_example: A single serialized ``tf.train.Example``.

    Returns:
        A dict with scalar int64 ``"song_id"`` and ``"genre_id"`` entries and a
        dense float32 ``"time_series"`` entry.
    """
    schema = dict(
        song_id=tf.io.FixedLenFeature([], tf.int64),
        genre_id=tf.io.FixedLenFeature([], tf.int64),
        # Declared variable-length, so it is parsed as a sparse tensor.
        time_series=tf.io.VarLenFeature(tf.float32),
    )
    parsed = tf.io.parse_single_example(tf_example, schema)
    # Densify the sparse series so consumers receive a regular tensor.
    dense_series = tf.sparse.to_dense(parsed["time_series"])
    parsed["time_series"] = dense_series
    return parsed

# Generación de tfrecords

In [11]:
# Show the working directory: the relative dataset paths are resolved against it.
# NOTE(review): the saved output exposes an absolute local path — consider
# clearing it before sharing the notebook.
os.getcwd()

'd:\\WIndowsRepositories\\project_mml\\dataset\\notebooks'

In [12]:
# Convert every npy example into chunked TFRecord files under TF_DATASET_FOLDER.
npy_dataset_to_tfrecords()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  14 | elapsed:   53.6s remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   5 out of  14 | elapsed:   54.0s remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   7 out of  14 | elapsed:   54.3s remaining:   54.3s
[Parallel(n_jobs=-1)]: Done   9 out of  14 | elapsed:   54.4s remaining:   30.2s
[Parallel(n_jobs=-1)]: Done  11 out of  14 | elapsed:   54.7s remaining:   14.8s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   55.0s finished


# Lectura de tfrecords como dataset

In [13]:
# Read the TFRecord files written above back as a parsed tf.data dataset.
dataset = load_dataset(TF_DATASET_FOLDER)

In [14]:
# Display the dataset repr to check the element spec (keys, shapes and dtypes).
dataset

<_MapDataset element_spec={'time_series': TensorSpec(shape=(None,), dtype=tf.float32, name=None), 'genre_id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'song_id': TensorSpec(shape=(), dtype=tf.int64, name=None)}>

In [15]:
# Peek at the first three parsed examples as a quick sanity check.
for example in dataset.take(3):
  print(example)

{'time_series': <tf.Tensor: shape=(1321967,), dtype=float32, numpy=
array([-9.6101727e-10, -1.1001114e-08, -1.3324586e-08, ...,
       -1.0322432e-01, -1.0536708e-01, -1.0047625e-01], dtype=float32)>, 'genre_id': <tf.Tensor: shape=(), dtype=int64, numpy=5>, 'song_id': <tf.Tensor: shape=(), dtype=int64, numpy=3796>}
{'time_series': <tf.Tensor: shape=(1321967,), dtype=float32, numpy=
array([-3.22965810e-09, -4.43648602e-08, -1.15803765e-07, ...,
       -4.44202453e-01, -4.89289045e-01, -5.32377601e-01], dtype=float32)>, 'genre_id': <tf.Tensor: shape=(), dtype=int64, numpy=5>, 'song_id': <tf.Tensor: shape=(), dtype=int64, numpy=18028>}
{'time_series': <tf.Tensor: shape=(1321967,), dtype=float32, numpy=
array([-0.05667935, -0.05044877, -0.02995109, ...,  0.04404794,
        0.0305051 ,  0.01628765], dtype=float32)>, 'genre_id': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'song_id': <tf.Tensor: shape=(), dtype=int64, numpy=21426>}


2024-07-27 22:05:09.897613: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
