In [1]:
import random
from random import sample
from tqdm.auto import tqdm
import pandas as pd
from collections import defaultdict
import numpy as np
import tensorflow as tf
import itertools
import math
import time

In [19]:
'''
Distance between 2 red tokens: 1
Count number of red tokens: 2
Find token that appears maximum time: 3
Compute sequence length: 4
Palindrome Sequence: 5
Sorted Sequence: 6
Sum: 7
MAx: 8
Min: 9

'''


def Generate_data(vocab , problem_id , max_seq_length = 512, number_data_points = 100000):

  '''This function generates the data based on our inputs

  vocab: vaocabulary of chars we are working with
  max_seq_length: max sequence length our single input can be of
  number_data_points: number of data points we want to generate
  problem_id: id of the problem for which data has to be generated'''

  data = {}

  #Generating for problem 1
  if(problem_id == 1):
    
    print('**************GENERATING DATA FOR PROBLEM 1 *************\n')

    indices = [[i, j] for i in range(max_seq_length) for j in range(i+1, max_seq_length)]

    def create_sequence_and_lables(indices):
        sequence = tf.reduce_sum(tf.one_hot(indices, depth=max_seq_length, dtype='int64'), axis=0)
        label = indices[1] - indices[0]
        return sequence, label

    dataset = tf.data.Dataset.from_tensor_slices(indices)
    dataset = dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=False)
    dataset = dataset.map(create_sequence_and_lables)


    train_size = int(0.8 * len(indices))
    train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)


    train_dataset = train_dataset.batch(64, drop_remainder = True)
    test_dataset = test_dataset.batch(64, drop_remainder = True)

    return train_dataset, test_dataset

  #generating for problem 2
  if(problem_id == 2):
    print('**************GENERATING DATA FOR PROBLEM 2 *************\n')


    def int_to_sequence(i):
      return tf.convert_to_tensor(list(map(int, (format(i.numpy(), 'b')))))

    def map_int_to_sequence(i):
      sequence = tf.py_function(int_to_sequence, [i], tf.int32)
      label = tf.reduce_sum(sequence)
      # Adding 1 to differentiate between padding value and token ids
      return tf.add(sequence, 1), label

    dataset = tf.data.Dataset.from_tensor_slices(tf.range(100000))
    dataset = dataset.map(map_int_to_sequence, num_parallel_calls=tf.data.AUTOTUNE)

    train_size = int(0.8 * number_data_points)
    train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)

    train_dataset = train_dataset.padded_batch(512, padded_shapes=((None,), ()))
    test_dataset = test_dataset.padded_batch(512, padded_shapes=((None,), ()))


    return train_dataset, test_dataset



  '''data for problem 3'''
  if(problem_id == 3):

      print('**************GENERATING DATA FOR PROBLEM 3 *************\n')

      '''filling current i token in place of maximum # of 1 or 0 in the binary epresentation of i'''
    
      def make_data(i):
        
        b = format(i,'b')
        num_one = len([1 for char in b if char == '1'])
        num_zero = len([1 for char in b if char == '0'])

        if(num_zero > num_one):
          point = [vocab[i%len(vocab)] if char == '0' else vocab[((max_seq_length - len(b)) + index)%len(vocab)] for index,char in enumerate(b)]

        else:
          point = [vocab[i%len(vocab)] if char == '1' else vocab[((max_seq_length - len(b)) + index)%len(vocab)] for index,char in enumerate(b)]


        return tf.convert_to_tensor(point,'string')
        

      def gen_data(i):
        sequence = tf.py_function(make_data, [i], 'string')
        label = tf.gather(vocab, i%len(vocab))
        return sequence,label

      dataset = tf.data.Dataset.from_tensor_slices(tf.range(number_data_points))

      dataset = dataset.map(gen_data, num_parallel_calls=tf.data.AUTOTUNE)
      dataset = dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=False)


      train_size = int(0.8 * number_data_points)
      train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)

      train_dataset = train_dataset.padded_batch(512, padded_shapes=((None,), ()))
      test_dataset = test_dataset.padded_batch(512, padded_shapes=((None,), ()))


      return train_dataset, test_dataset

  '''Data for problem 4'''
  if(problem_id == 4):

      print('**************GENERATING DATA FOR PROBLEM 4 *************\n')

      indices = [[0, j] for j in range(1, max_seq_length)]

      def create_sequence_and_lables(indices):
        sequence = tf.reduce_sum(tf.one_hot(tf.range(indices[0],indices[1]), depth=max_seq_length), axis=0)
        label = indices[1] - indices[0]
        return sequence, label

      dataset = tf.data.Dataset.from_tensor_slices(indices)
      dataset = dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=False)
      dataset = dataset.map(create_sequence_and_lables)


      train_size = int(0.8 * len(indices))
      train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)


      train_dataset = train_dataset.batch(64)
      test_dataset = test_dataset.batch(64)

      return train_dataset, test_dataset

  '''data for problem 5'''
  if(problem_id == 5):

    print('**************GENERATING DATA FOR PROBLEM 5 *************\n')


    '''if i is even data point will be palindrome else non palindrome'''
    def make_palindrome(i):
      '''if len of binary representation of i is even then even lenght palindrome else odd length palindrome'''
      if(i%2 == 0):
          b = format(i,'b')
          l_b = len(b)
          b = '0'*(max_seq_length - len(b)) + b
          indices = [index%len(vocab) for index,char in enumerate(b) if char == '1']
          if(l_b %2 == 0):
            point = [vocab[i] for i in indices] + [vocab[i] for i in indices[: : -1]]

          else:
            point = [vocab[i] for i in indices] + [vocab[l_b]] + [vocab[i] for i in indices[: : -1]]
      else:
          b = format(i,'b')
          l_b = len(b)
          b = '0'*(max_seq_length - len(b)) + b
          indices = [index%len(vocab) for index,char in enumerate(b) if char == '1']
          point = [vocab[i] for i in indices] + [vocab[i] for i in [x*(x+4)%len(vocab) for x in indices]]

      return tf.convert_to_tensor(point,'string')

    def gen_data(i):
        sequence = tf.py_function(make_palindrome, [i], 'string')

        if(i %2 == 0):
          label = 1
        else:
          label = 0
        return sequence,label

    dataset = tf.data.Dataset.from_tensor_slices(tf.range(1,number_data_points))
    dataset = dataset.map(gen_data, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=False)

    train_size = int(0.8 * number_data_points)
    train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)

    train_dataset = train_dataset.padded_batch(512, padded_shapes=((None,), ()))
    test_dataset = test_dataset.padded_batch(512, padded_shapes=((None,), ()))


    return train_dataset, test_dataset



  if(problem_id == 6):

    print('**************GENERATING DATA FOR PROBLEM 6 *************\n')


    def make_sort_data(j):

      start = j
      b = format(start, 'b')
      s = len([1 for char in format(start, 'b') if char == '1'])
      m = 26
      num = [s]
      elem = j%67

      for i in range(elem):
        a = len([1 for char in format(start,'b') if char == '0'])
        s = len([1 for char in format(start, 'b') if char == '1'])

        c = s+a


        next = (a*num[-1] + c) % m

        if(j % 2 == 0):

          #preparing sort data
          if(next >= num[-1]):
            num.append(next)

            start = num[-1]
          else:
            start = next

        else:
            #prepraing unsort data
            num.append(next)
            start = num[-1]

      return tf.convert_to_tensor(list(map(lambda x: vocab[x] ,num)), 'string')

    def gen_sort_data(i):

        sequence = tf.py_function(make_sort_data, [i], 'string')


        if(i %2 == 0):
          label = 1
        else:
          label = 0

        return sequence,label

    
    dataset = tf.data.Dataset.from_tensor_slices(tf.range(number_data_points))

    dataset = dataset.map(gen_sort_data, num_parallel_calls=tf.data.AUTOTUNE)

    dataset = dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=False)

    train_size = int(0.8 * number_data_points)
    train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)

    train_dataset = train_dataset.padded_batch(512, padded_shapes=((None,), ()))
    test_dataset = test_dataset.padded_batch(512, padded_shapes=((None,), ()))


    return train_dataset, test_dataset



  if(problem_id == 7 or problem_id == 8 or problem_id == 9):

    print('**************GENERATING DATA FOR PROBLEM 7/8/9 *************\n')


    def make_number_data(i):
      return tf.convert_to_tensor([index%len(vocab) for index,char in enumerate('0'*(max_seq_length - len(format(i,'b'))) + format(i,'b')) if char == '1'],'int64')

    def gen_data(i):

      sequence = tf.py_function(make_number_data, [i], 'int64')

      if(problem_id == 7):
        label = tf.reduce_sum(sequence)
      elif(problem_id == 8):
        label = tf.reduce_max(sequence)
      else:
        label = tf.reduce_min(sequence)

      return sequence, label

    dataset = tf.data.Dataset.from_tensor_slices(tf.range(2**27,2**27 + number_data_points))
    dataset = dataset.map(gen_data, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=False)

    train_size = int(0.8 * number_data_points)
    train_dataset, test_dataset = dataset.take(train_size), dataset.skip(train_size)

    train_dataset = train_dataset.padded_batch(512, padded_shapes=((None,), ()))
    test_dataset = test_dataset.padded_batch(512, padded_shapes=((None,), ()))


    return train_dataset, test_dataset


In [None]:
vocab_3 = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
vocab_4 = list(range(10))
train_dataset, test_dataset = Generate_data(vocab_3, 1, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 1 *************

(<tf.Tensor: shape=(64, 512), dtype=int64, numpy=
array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])>, <tf.Tensor: shape=(64,), dtype=int32, numpy=
array([454, 112, 201, 487, 437, 269, 383, 482, 442, 163,   5, 145, 204,
       142, 197, 509, 250, 147, 293, 253, 468,  58, 470, 282, 508,   6,
       168,  21, 124, 163, 488, 447, 341, 479,  58, 128, 484, 380, 265,
       344,  76, 278,  54, 182, 463, 131, 237,  80,  16,   6, 116, 185,
       261,  96, 293,  59,  38,  30, 348, 270, 275, 134, 377, 402],
      dtype=int32)>)


In [None]:
train_dataset, test_dataset = Generate_data(vocab_3, 2, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 2 *************

(<tf.Tensor: shape=(512, 9), dtype=int32, numpy=
array([[1, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       [2, 1, 0, ..., 0, 0, 0],
       ...,
       [2, 2, 2, ..., 2, 1, 2],
       [2, 2, 2, ..., 2, 2, 1],
       [2, 2, 2, ..., 2, 2, 2]], dtype=int32)>, <tf.Tensor: shape=(512,), dtype=int32, numpy=
array([0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3,
       3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,
       3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2,
       2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5,
       3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
       5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3,
       2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4,
       4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
       3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6,

In [None]:
train_dataset, test_dataset = Generate_data(vocab_3, 3, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 3 *************

(<tf.Tensor: shape=(512, 11), dtype=string, numpy=
array([[b'I', b'J', b'Y', ..., b'Q', b'Y', b''],
       [b'F', b'L', b'M', ..., b'', b'', b''],
       [b'B', b'B', b'L', ..., b'B', b'', b''],
       ...,
       [b'C', b'C', b'C', ..., b'', b'', b''],
       [b'D', b'D', b'', ..., b'', b'', b''],
       [b'X', b'J', b'K', ..., b'X', b'X', b'']], dtype=object)>, <tf.Tensor: shape=(512,), dtype=string, numpy=
array([b'Y', b'F', b'B', b'G', b'G', b'D', b'O', b'X', b'J', b'V', b'O',
       b'X', b'Y', b'Q', b'R', b'G', b'D', b'V', b'V', b'J', b'Y', b'Z',
       b'V', b'E', b'X', b'Q', b'K', b'J', b'D', b'Z', b'T', b'H', b'A',
       b'O', b'L', b'W', b'S', b'I', b'H', b'P', b'C', b'W', b'B', b'H',
       b'I', b'R', b'L', b'E', b'C', b'W', b'Q', b'C', b'K', b'B', b'T',
       b'B', b'R', b'C', b'U', b'C', b'S', b'U', b'R', b'H', b'H', b'D',
       b'K', b'K', b'V', b'K', b'Z', b'V', b'H', b'K', b'N', b'G', b'R',
       b'K', b'Q'

In [None]:
train_dataset, test_dataset = Generate_data(vocab_3, 4, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 4 *************

(<tf.Tensor: shape=(64, 512), dtype=float32, numpy=
array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: shape=(64,), dtype=int32, numpy=
array([329, 376, 277, 475, 486, 175, 332,  14,  97, 223, 434, 276, 433,
       229,  10, 487, 384, 303, 113, 305, 284, 427,  32, 425, 159, 213,
       437,  70, 348, 304, 371, 115, 438, 356, 400, 450,  89, 235,   2,
       311,  46, 155, 289,  94, 168, 386, 203, 123, 441,  56, 200, 185,
       110,  53, 187, 117, 147, 183,  36, 162, 105, 217,  47,  17],
      dtype=int32)>)


In [None]:
train_dataset, test_dataset = Generate_data(vocab_3, 5, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 5 *************

(<tf.Tensor: shape=(512, 20), dtype=string, numpy=
array([[b'I', b'J', b'L', ..., b'', b'', b''],
       [b'K', b'N', b'O', ..., b'', b'', b''],
       [b'I', b'J', b'K', ..., b'', b'', b''],
       ...,
       [b'H', b'K', b'L', ..., b'', b'', b''],
       [b'H', b'K', b'N', ..., b'', b'', b''],
       [b'I', b'L', b'O', ..., b'', b'', b'']], dtype=object)>, <tf.Tensor: shape=(512,), dtype=int32, numpy=
array([1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 

In [None]:
train_dataset, test_dataset = Generate_data(vocab_3, 6, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 6 *************

(<tf.Tensor: shape=(512, 67), dtype=string, numpy=
array([[b'F', b'J', b'W', ..., b'', b'', b''],
       [b'H', b'K', b'Y', ..., b'', b'', b''],
       [b'G', b'I', b'K', ..., b'', b'', b''],
       ...,
       [b'H', b'X', b'C', ..., b'', b'', b''],
       [b'E', b'I', b'K', ..., b'', b'', b''],
       [b'D', b'F', b'I', ..., b'', b'', b'']], dtype=object)>, <tf.Tensor: shape=(512,), dtype=int32, numpy=
array([0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 

In [None]:
train_dataset, test_dataset = Generate_data(vocab_4, 7, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 7/8/9 *************

(<tf.Tensor: shape=(512, 11), dtype=int64, numpy=
array([[4, 2, 3, ..., 1, 0, 0],
       [4, 2, 3, ..., 0, 0, 0],
       [4, 2, 3, ..., 0, 0, 0],
       ...,
       [4, 1, 3, ..., 0, 0, 0],
       [4, 2, 3, ..., 0, 0, 0],
       [4, 1, 7, ..., 0, 0, 0]])>, <tf.Tensor: shape=(512,), dtype=int64, numpy=
array([38, 30, 36, 37, 26, 28, 22, 40, 32, 12, 18, 28, 22,  4, 32, 31, 29,
       31, 15, 32, 20, 18, 18, 36, 17, 29, 18, 16, 22, 27, 39, 23, 26, 23,
       26, 34, 13, 23, 31, 36, 12, 24, 28, 24, 17, 22, 20, 27, 19, 15, 31,
       31, 38, 27, 21, 23, 32, 19, 20, 25, 29, 29, 15, 37, 27, 20, 17, 27,
       36, 38, 25, 21, 32, 48, 26, 22, 32, 34,  5, 29, 13, 28, 36, 31, 35,
       24, 24, 32, 43, 40, 22, 14, 43, 37, 11, 31, 25, 26, 30, 26, 21, 19,
       14, 28, 22, 46, 39, 12, 31, 19, 27, 19, 41, 34, 33, 36, 27, 40, 32,
       19, 29, 33, 28, 22, 20, 27, 12, 21, 11, 25, 31, 23, 27, 36, 14, 29,
       28, 35, 31, 10, 36, 21, 40,

In [None]:
train_dataset, test_dataset = Generate_data(vocab_4, 8, 512)
for batch in train_dataset:
  print(batch)
  break


**************GENERATING DATA FOR PROBLEM 7/8/9 *************

(<tf.Tensor: shape=(512, 11), dtype=int64, numpy=
array([[4, 5, 6, ..., 0, 0, 0],
       [4, 2, 4, ..., 0, 0, 0],
       [4, 3, 4, ..., 0, 0, 0],
       ...,
       [4, 1, 3, ..., 0, 0, 0],
       [4, 1, 3, ..., 0, 1, 0],
       [4, 2, 5, ..., 0, 0, 0]])>, <tf.Tensor: shape=(512,), dtype=int64, numpy=
array([6, 4, 9, 8, 9, 9, 9, 8, 9, 8, 9, 7, 9, 9, 8, 9, 4, 7, 9, 9, 6, 9,
       7, 5, 8, 9, 9, 4, 9, 8, 7, 7, 8, 9, 9, 5, 8, 9, 8, 9, 7, 9, 8, 9,
       9, 9, 7, 9, 8, 9, 6, 5, 9, 7, 8, 9, 8, 8, 8, 9, 9, 6, 7, 8, 9, 9,
       9, 9, 7, 9, 9, 9, 9, 8, 9, 9, 9, 8, 8, 7, 9, 8, 9, 9, 9, 9, 9, 9,
       7, 8, 7, 9, 9, 6, 4, 8, 9, 8, 8, 8, 6, 4, 9, 8, 9, 9, 9, 9, 8, 9,
       9, 9, 8, 9, 8, 9, 8, 8, 9, 8, 9, 9, 8, 9, 8, 6, 9, 8, 8, 6, 9, 8,
       8, 4, 9, 9, 6, 7, 9, 9, 9, 8, 7, 7, 9, 9, 9, 8, 6, 8, 9, 9, 9, 9,
       9, 6, 9, 9, 9, 9, 9, 9, 7, 7, 8, 9, 9, 4, 9, 9, 8, 7, 9, 9, 5, 9,
       9, 8, 5, 8, 9, 6, 9, 9, 4, 9, 9, 9, 9, 8, 5

In [None]:
train_dataset, test_dataset = Generate_data(vocab_4, 9, 512)
for batch in train_dataset:
  print(batch)
  break

**************GENERATING DATA FOR PROBLEM 7/8/9 *************

(<tf.Tensor: shape=(512, 11), dtype=int64, numpy=
array([[4, 2, 3, ..., 0, 0, 0],
       [4, 3, 6, ..., 0, 0, 0],
       [4, 3, 5, ..., 0, 0, 0],
       ...,
       [4, 3, 6, ..., 0, 0, 0],
       [4, 1, 3, ..., 0, 0, 0],
       [4, 2, 3, ..., 0, 0, 0]])>, <tf.Tensor: shape=(512,), dtype=int64, numpy=
array([2, 3, 3, 1, 1, 0, 0, 4, 1, 0, 0, 1, 4, 0, 0, 3, 0, 2, 1, 1, 0, 0,
       0, 4, 1, 3, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 3, 4, 0, 0, 1, 0, 2, 0, 0, 2, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 4, 0, 2, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 4, 0, 1, 0, 1, 3, 2, 0, 1, 1, 1, 1, 0, 0, 0, 2, 0, 2, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 2, 1, 0, 1, 1, 0, 2,
       2, 2, 0, 0, 1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 1, 4, 2, 0, 2, 4, 0,
       0, 0, 1, 0, 4, 2, 0, 1, 0, 2, 0, 0, 0, 3, 0, 0, 1, 1, 0, 0, 0, 2,
       1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0

Modelling

In [3]:
from tensorflow.keras.layers import Embedding, Dense, Input
from tensorflow.keras.models import Model

In [20]:
vocab_3 = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
vocab_4 = list(range(10))
train_dataset, test_dataset = Generate_data(vocab_3, 1, 512)
frst_batch = None
for batch in train_dataset:
  frst_batch = batch
  print(batch)
  break

for batch in train_dataset:
  if(batch[0].shape[0] != 64):
    print('here')
frst_batch[0].shape

**************GENERATING DATA FOR PROBLEM 1 *************

(<tf.Tensor: shape=(64, 512), dtype=int64, numpy=
array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])>, <tf.Tensor: shape=(64,), dtype=int32, numpy=
array([411,   7, 322, 219, 244,  19, 494, 490, 101, 129, 114,  21, 445,
       313,  37,  23, 233, 178, 280, 193, 368, 115, 476, 497, 484, 186,
       134, 227, 138,  83, 124, 147, 452, 237, 163, 165, 246, 264, 284,
       215, 272, 422, 383, 322, 367, 472, 435, 407, 136, 198,  29, 197,
         1,  46,  85, 401,  57, 156, 353, 161, 455, 326,  22, 348],
      dtype=int32)>)


TensorShape([64, 512])

In [5]:
config = {
    'num_heads' : 1,
    'num_layers': 1,
    'emb_dim': 128,
    'seq_length': 512,
    'vocab_size': 2,
    'head_size': 64, #size of single dense layer head
    'pos_embedding': True, #True or False weather to learn positional embeddings
    'agg_method': 'TOKEN' #one of TOKEN or SUM
}


In [6]:

class Encodings(tf.keras.layers.Layer):

  '''
    This class takes input tokens and return the embeddings with class token appended and positional embeddings added
  '''

  def __init__(self, config):

    super().__init__()
    self.embedding_dim = config['emb_dim']
    self.seq_length = config['seq_length']
    self.vocab_size = config['vocab_size']
    self.pos_embedding_flag = config['pos_embedding']
    self.embedding = Embedding(self.vocab_size, self.embedding_dim, input_length = self.seq_length)
    self.pos_embedding = Embedding(self.seq_length + 1, self.embedding_dim)
    
  def build(self, input_shape):
    batch_size = 64
    self.class_tokens = Embedding(batch_size, self.embedding_dim)
    self.class_tokens = self.class_tokens(tf.range(start=0, limit=batch_size, delta=1))


  def call(self, batch):
    embedding_out = tf.concat([self.embedding(batch), tf.expand_dims(self.class_tokens, axis=1)], axis=1)

    if(self.pos_embedding_flag):
      pos_embedding = self.pos_embedding(tf.range(start = 0, limit = self.seq_length + 1, delta=1))
      embedding_out = embedding_out + pos_embedding


    return embedding_out


In [7]:
embeddings = Encodings(config)(frst_batch[0])
embeddings.shape

TensorShape([64, 513, 128])

In [8]:
class Attention(tf.keras.layers.Layer):

  '''
    This class implements the Attention layer mechanism and returns the attention output and attention scores for each attention head
  '''

  def __init__(self,config):
    super().__init__()

    self.num_att_heads = config['num_heads']
    self.attention_head_size = int(config['emb_dim'] / self.num_att_heads)
    self.all_head_size = self.num_att_heads * self.attention_head_size

    self.query = Dense(self.all_head_size)
    self.key = Dense(self.all_head_size)
    self.value = Dense(self.all_head_size)
    self.out = Dense(config['emb_dim'])

  def call(self, hidden_states):

    #getting the query , key and value vectors
    mixed_query_layer = self.query(hidden_states)
    mixed_key_layer = self.key(hidden_states)
    mixed_value_layer = self.value(hidden_states)

    hidden_states_shape = tf.shape(hidden_states)

    #Dividing query keay and value vectors between given number of attention heads
    query_layer = tf.reshape(mixed_query_layer, shape = (hidden_states_shape[0], self.num_att_heads, 
                                 hidden_states_shape[1],
                                 self.attention_head_size))
    

    key_layer = tf.reshape(mixed_key_layer, shape = (hidden_states_shape[0], self.num_att_heads, 
                                 hidden_states_shape[1],
                                 self.attention_head_size))
    
    value_layer = tf.reshape(mixed_value_layer, shape = (hidden_states_shape[0], self.num_att_heads, 
                                 hidden_states_shape[1],
                                 self.attention_head_size))
    
    #getting the attention scores
    attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0,1,3,2]))

    attention_scores = attention_scores / math.sqrt(self.attention_head_size)

    attention_probs = tf.nn.softmax(attention_scores, axis=-1)

    #getting the attention output
    context_layer = tf.matmul(attention_probs, value_layer)
    context_layer = tf.reshape(context_layer, shape=( hidden_states_shape[0],
                                                         hidden_states_shape[1],
                                                         hidden_states_shape[2]))
    
    att_output = self.out(context_layer)

    return att_output, attention_probs




In [9]:
att_output, att_scores = Attention(config)(embeddings)

In [10]:
att_output.shape, att_scores.shape, np.sum(att_scores[0,0,54]) , att_output[:,0,:].shape, tf.math.reduce_sum(att_output[:,1:,:], axis = 1).shape


(TensorShape([64, 513, 128]),
 TensorShape([64, 1, 513, 513]),
 1.0,
 TensorShape([64, 128]),
 TensorShape([64, 128]))

In [11]:
class _Model(tf.keras.layers.Layer):

  def __init__(self, config):
    super().__init__()

    self.encodings = Encodings(config)
    self.attention = Attention(config)
    self.agg_method = config['agg_method']
    self.head_dim = config['head_size']
    self.head = Dense(self.head_dim)


  def call(self, input):
    op = self.encodings(input)
    att_op, att_scores = self.attention(op)

    if(self.agg_method == 'TOKEN'):
      op = att_op[:,0,:]
    else:
      op = tf.math.reduce_sum(att_op[:, 1:, :], axis = 1)

    op = self.head(op)


    return op, att_scores




In [12]:
op, att_scores = _Model(config)(frst_batch[0])
op.shape, att_scores.shape

(TensorShape([64, 64]), TensorShape([64, 1, 513, 513]))

In [13]:
input_shape = (512)
input = Input(input_shape)
op, att_scores = _Model(config)(input)
output = Dense(1, activation='linear')(op)

model = Model(inputs = input, outputs=output)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 private__model_1 (_Model)   ((64, 64),                140224    
                              (64, 1, 513, 513))                 
                                                                 
 dense_14 (Dense)            (64, 1)                   65        
                                                                 
Total params: 140,289
Trainable params: 140,289
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.compile(optimizer= 'adam', loss='mean_squared_error', metrics = ['mean_squared_error'])
model.fit(train_dataset, epochs = 1, validation_data = test_dataset, steps_per_epoch = len(train_dataset))



<keras.callbacks.History at 0x7f0839e8d290>