<a href="https://colab.research.google.com/github/harsh31415926/Deep-Learning/blob/main/YT9_input_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import pandas as pd
import numpy as np
import tensorflow as tf


In [48]:
# Sample daily sales figures; note the two negative entries (-108 and -1).
daily_sales_numbers = [21, 22, -108, 31, -1, 32, 34,31]

# Slice the list into a dataset with one scalar element per list entry.
tf_dataset = tf.data.Dataset.from_tensor_slices(daily_sales_numbers)
tf_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [49]:
# Iterating the dataset directly yields tensors; .numpy() unwraps each one
# so that only the plain value is printed.
for sales in tf_dataset:
  print(sales.numpy())

21
22
-108
31
-1
32
34
31


In [50]:
# Keep only the strictly positive values (drops -108 and -1).
data = tf_dataset.filter(lambda x : x>0)
# as_numpy_iterator() yields NumPy values directly, so no .numpy() call is needed.
for sales in data.as_numpy_iterator():
  print(sales)

21
22
31
32
34
31


In [51]:
# Multiply every element by 100. This maps the unfiltered tf_dataset, so the
# negative entries are still present in the result.
data2 = tf_dataset.map(lambda x:x*100)

for sales in data2.as_numpy_iterator():
  print(sales)


2100
2200
-10800
3100
-100
3200
3400
3100


In [52]:
# Shuffle using a 3-element buffer. The buffer is smaller than the 8-element
# dataset, so this is only a local shuffle: each output is drawn at random from
# the next 3 pending elements, not from the whole dataset.
# The fixed seed makes the printed order reproducible after a kernel restart;
# without it every run of the notebook shows a different order.
data3 = tf_dataset.shuffle(3, seed=42)

for sales in data3.as_numpy_iterator():
  print(sales)


-108
31
22
32
34
21
31
-1


In [53]:
# Group consecutive elements into batches of 2 (8 elements -> 4 batches).
data4 = tf_dataset.batch(2)

for sales in data4.as_numpy_iterator():
  print(sales)


[21 22]
[-108   31]
[-1 32]
[34 31]


# **We can do all of this in one line**

In [54]:
# Chain the previous steps: scale by 6 -> shuffle (3-element buffer) -> drop
# non-positive values -> batch in threes.
# The shuffle is seeded so the batch contents are reproducible after a kernel
# restart; the 6 positive values always produce exactly 2 full batches.
all_data = tf_dataset.map(lambda x: x * 6).shuffle(3, seed=42).filter(lambda x: x > 0).batch(3)

for sales in all_data.as_numpy_iterator():
  print(sales)


[126 132 186]
[192 204 186]


# Implementing an Advanced Pipeline

In [55]:
import tensorflow as tf
import time

In [56]:
# Display the installed TensorFlow version.
tf.__version__

'2.19.0'

In [57]:
class FileDataset(tf.data.Dataset):
  """Artificial dataset whose generator sleeps to imitate slow reads.

  Calling ``FileDataset(n)`` does not produce a ``FileDataset`` instance:
  ``__new__`` returns the dataset built by ``tf.data.Dataset.from_generator``.
  """

  def read_files_in_batches(num_samples):
    """Yield the one-element tuples ``(0,)``, ``(1,)``, ... ``(num_samples - 1,)``.

    Takes no ``self``/``cls``: ``__new__`` passes it to ``from_generator`` as a
    plain function via ``cls.read_files_in_batches``.
    """
    # One-off delay before the first element (presumably stands in for
    # opening a file).
    time.sleep(0.03)

    for idx in range(num_samples):
      # Per-element delay (presumably stands in for reading one record).
      time.sleep(0.015)
      yield (idx,)

  def __new__(cls, num_samples=3):
    """Return a generator-backed dataset that yields ``num_samples`` elements."""
    # Each element is a length-1 int64 vector, matching the 1-tuples the
    # generator yields.
    element_spec = tf.TensorSpec(dtype=tf.int64, shape=(1,))
    return tf.data.Dataset.from_generator(
        cls.read_files_in_batches,
        args=(num_samples,),
        output_signature=element_spec,
    )

In [58]:
def benchmark(dataset, num_epochs=2):
  """Iterate over ``dataset`` ``num_epochs`` times, pausing 10 ms per element.

  The pause stands in for the work a consumer would do with each element; the
  elements themselves are ignored. Returns ``None`` -- the function is meant to
  be wrapped in a timer such as ``%%timeit``.
  """
  for _ in range(num_epochs):
    # A fresh iteration is started every epoch, so the whole input pipeline
    # runs again unless the dataset itself caches its elements.
    for _element in dataset:
      time.sleep(0.01)

In [59]:
%%timeit
# Baseline: default 3-sample FileDataset, default 2 epochs, no prefetching.
benchmark(FileDataset())

277 ms ± 32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [60]:
%%timeit
# Same run with prefetch(3) appended to the pipeline.
# NOTE(review): the saved output of this cell is slower than the baseline
# cell's; re-measure before drawing conclusions about prefetch.
benchmark(FileDataset().prefetch(3))

396 ms ± 50.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [61]:
%%timeit
# Let tf.data choose the prefetch buffer size instead of fixing it at 3.
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

396 ms ± 51.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [62]:
# Small integer dataset (0..4) used by the map() and cache() cells below.
dataset = tf.data.Dataset.range(5)
for d in dataset :
  print(d.numpy())

0
1
2
3
4


In [63]:
# Raise every element to the 4th power.
# NOTE: this rebinds `dataset` to its own mapped version, so re-running this
# cell applies the map again unless the range(5) cell is re-run first.
dataset = dataset.map(lambda x : x**4)
for  d in dataset:
  print(d.numpy())

0
1
16
81
256


In [64]:
# cache() stores the mapped elements during the first complete pass over the
# dataset; later passes are served from the cache instead of re-running x**4.
dataset = dataset.cache()

# This first pass still computes x**4 and fills the cache as it goes.
list(dataset.as_numpy_iterator())

[np.int64(0), np.int64(1), np.int64(16), np.int64(81), np.int64(256)]

In [66]:
def mapped_function(s):
  """Return ``s`` unchanged after a 30 ms pause that stands in for costly work.

  Intended for ``Dataset.map``. The pause is wrapped in ``tf.py_function`` with
  no inputs and no outputs so that the Python-level ``time.sleep`` is issued as
  an op rather than called directly in the mapped function body.
  """
  def _pause():
    time.sleep(0.03)

  tf.py_function(_pause, [], ())
  return s

In [72]:
%%timeit -n1 -r1
# 50 epochs without cache(): the generator delays and mapped_function's 0.03 s
# delay are paid again for every element of every epoch.
benchmark(FileDataset().map(mapped_function),50)


10.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [73]:
%%timeit -n1 -r1
# Same 50 epochs with cache() after the map: once the first epoch has filled
# the cache, later epochs are served from it without re-running the generator
# or mapped_function.
benchmark(FileDataset().map(mapped_function).cache(),50)



1.83 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Here we can see that using cache reduces the run time by more than 5 times (10.7 s → 1.83 s)