In [2]:
import os
import pandas as pd
import numpy as np
import time
np.set_printoptions(suppress=True)
from data import tf_data, read_stations

import tensorflow as tf 
from tensorflow.keras.layers import Dense, MultiHeadAttention, LayerNormalization
from tensorflow.keras import Model

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# Sanity check of MultiHeadAttention on rank-3 symbolic inputs: a query with
# 8 positions attends over a key/value source with 4 positions (16 features each).
layer = MultiHeadAttention(num_heads=1, key_dim=2)
target = tf.keras.Input(shape=[8, 16])
source = tf.keras.Input(shape=[4, 16])
# The second positional argument is used as the value (and, with no explicit
# key, as the key too).
output_tensor, weights = layer(target, source,
                               return_attention_scores=True)
# Recorded output: (None, 8, 16) - same shape as the query.
print(output_tensor.shape)

# Recorded output: (None, 1, 8, 4) - (batch, heads, query positions, source positions).
print(weights.shape)
# layer.weights

(None, 8, 16)
(None, 1, 8, 4)


In [4]:
# Same check with rank-4 inputs sized like the tensors used later in this
# notebook (1 target step vs. 18 source steps, 147 stations, 10 channels).
# No `attention_axes` is passed here.
layer = MultiHeadAttention(num_heads=1, key_dim=64)
target = tf.keras.Input(shape=[1, 147, 10])
source = tf.keras.Input(shape=[18, 147, 10])
output_tensor, weights = layer(target, source,
                               return_attention_scores=True)
# Recorded output: (None, 1, 147, 10) - same shape as the target.
print(output_tensor.shape)

# Recorded output: (None, 1, 1, 147, 18, 147) - every (step, station) of the
# target is scored against every (step, station) of the source.
print(weights.shape)

(None, 1, 147, 10)
(None, 1, 1, 147, 18, 147)


In [5]:
# layer.weights

In [6]:
# Data-loading configuration. `tf_data` comes from the local `data` module; the
# meaning of each argument is inferred from its name - TODO confirm against
# data.py.
transactions_path = '../data/clean_transactions.csv'
stations_path = '../data/clean_stations_database_v2.csv'
aggregation = "15-mins"
max_stations = None
max_transactions = 10000
train_date = '2015-12-01'

# Returns two dict-like containers (keys: 'features', 'time_embeddings',
# 'spatial_embeddings', 'labels', as shown by `train_data.keys()` below) plus
# `metadata`. The call also prints the shapes of each entry.
train_data, test_data, metadata = tf_data(
        transactions_path,
        stations_path,
        aggregation,
        train_date,
        max_transactions,
        max_stations)

Train features shape: (8206, 18, 147)
Test features shape: (730, 18, 147)

Train time_embeddings shape: (8206, 19, 8)
Test time_embeddings shape: (730, 19, 8)

Train spatial_embeddings shape: (8206, 147, 2)
Test spatial_embeddings shape: (730, 147, 2)

Train labels shape: (8206, 147)
Test labels shape: (730, 147)

(8206,)
(730,)


In [7]:
# First training sample, all 18 entries of axis 1, index 2 of axis 2
# (presumably: the 18-step input window for station index 2).
train_data['features'][0,:,2]

<tf.Tensor: shape=(18,), dtype=float64, numpy=
array([  0.,   0.,   0.,   0., 242., 215., 212., 172., 221., 295., 250.,
       205.,  80.,  83., 128.,  72.,  54.,  24.])>

In [8]:
# Label of the first training sample, first entry of the 147-wide label axis.
train_data['labels'][0,:1]

<tf.Tensor: shape=(1,), dtype=float64, numpy=array([0.])>

In [9]:
# List the entries available in the training container.
train_data.keys()

dict_keys(['features', 'time_embeddings', 'spatial_embeddings', 'labels'])

In [10]:
# Last of the 19 time-embedding rows of the first sample (8 values per row).
# The same `[:, -1:]` slice is what later cells feed to the decoder.
train_data['time_embeddings'][0,-1:]

<tf.Tensor: shape=(1, 8), dtype=float64, numpy=
array([[-0.67638112, -0.73655181,  0.93087375, -0.36534102,  0.8660254 ,
         0.5       ,  0.        ,  1.        ]])>

In [11]:
# Displays the whole (730, 147, 2) test spatial-embedding tensor. In the
# truncated recorded output every sample shows the same per-station values.
# NOTE(review): a slice such as [0, :5] would show the same information.
test_data['spatial_embeddings']

<tf.Tensor: shape=(730, 147, 2), dtype=float64, numpy=
array([[[-4.40028658, -4.84577421],
        [-4.78932063, -4.95052437],
        [-4.08125421, -4.77791487],
        ...,
        [ 2.06380647, -3.18630475],
        [ 2.00396156, -3.45237508],
        [ 2.2462913 ,  3.51326342]],

       [[-4.40028658, -4.84577421],
        [-4.78932063, -4.95052437],
        [-4.08125421, -4.77791487],
        ...,
        [ 2.06380647, -3.18630475],
        [ 2.00396156, -3.45237508],
        [ 2.2462913 ,  3.51326342]],

       [[-4.40028658, -4.84577421],
        [-4.78932063, -4.95052437],
        [-4.08125421, -4.77791487],
        ...,
        [ 2.06380647, -3.18630475],
        [ 2.00396156, -3.45237508],
        [ 2.2462913 ,  3.51326342]],

       ...,

       [[-4.40028658, -4.84577421],
        [-4.78932063, -4.95052437],
        [-4.08125421, -4.77791487],
        ...,
        [ 2.06380647, -3.18630475],
        [ 2.00396156, -3.45237508],
        [ 2.2462913 ,  3.51326342]],

       [

In [24]:
class MinMax(tf.keras.layers.Layer):
    """Min-max scaling layer with an optional inverse transform.

    Forward:  y = (x - min_x) / (max_x - min_x) * (max_t - min_t) + min_t
    Reverse:  x = (y - min_t) * (max_x - min_x) / (max_t - min_t) + min_x

    Args:
        min_value: Data minimum (stored as ``min_x``). May be left as ``None``
            and filled in later by :meth:`adapt`.
        max_value: Data maximum (stored as ``max_x``). May be left as ``None``
            and filled in later by :meth:`adapt`.
        range_values: Optional ``(low, high)`` pair giving the target range.
            When omitted the target range is ``(0, 1)``.

    Note:
        There is no guard against ``max_x == min_x`` or ``max_t == min_t``;
        both lead to a division by zero in :meth:`call`.
    """

    def __init__(self,*, min_value = None, max_value = None, range_values = None):
        super().__init__()

        # Data bounds (may still be None until adapt() is called).
        self.min_x = min_value
        self.max_x = max_value
        # Target range, (0, 1) unless overridden below.
        self.min_t = 0
        self.max_t = 1

        if range_values:
            self.min_t = range_values[0]
            self.max_t = range_values[1]

    def adapt(self, data):
        """Set the data bounds to the global minimum and maximum of ``data``.

        The reductions run over every axis of ``data``. ``.numpy()`` is called
        on the results, so this must be executed eagerly.
        """
        self.min_x = tf.math.reduce_min(data).numpy()
        self.max_x = tf.math.reduce_max(data).numpy()

    def call(self, x, reverse = False):
        """Scale ``x`` into the target range, or undo the scaling.

        Args:
            x: Tensor to transform.
            reverse: When ``True`` map values from the target range back to
                the data range; otherwise map data values to the target range.

        Returns:
            The transformed tensor, same shape as ``x``.

        Raises:
            ValueError: If the data bounds were neither passed to the
                constructor nor computed with :meth:`adapt`.
        """
        # Fail fast with a clear message; without this check the arithmetic
        # below raises an opaque TypeError on `x - None`.
        if self.min_x is None or self.max_x is None:
            raise ValueError(
                "MinMax has no data bounds: pass min_value/max_value to the "
                "constructor or call adapt() before using the layer.")

        data_span = self.max_x - self.min_x
        target_span = self.max_t - self.min_t

        if reverse:
            x = ((x - self.min_t) * data_span) / target_span
            x = x + self.min_x
        else:
            x = (x - self.min_x) / data_span
            x = x * target_span + self.min_t  # shift/scale into the target range
        return x

In [38]:
# Build a scaler targeting the range (0, 1) and fit its global min/max on the
# whole training feature tensor.
minmax_layer = MinMax(range_values = (0,1))
minmax_layer.adapt((train_data['features']))

In [39]:
# Scale the training features; `r` is inspected by the following cells.
r = minmax_layer(train_data['features'])

In [40]:
# Scaled values of sample 89, all entries of axis 1, index 0 of axis 2.
r[89,:,0]

<tf.Tensor: shape=(18,), dtype=float32, numpy=
array([0.12934545, 0.10162856, 0.6974632 , 0.32821798, 0.18321328,
       0.1807078 , 0.16630128, 0.18665831, 0.17945506, 0.19245224,
       0.16708425, 0.18383965, 0.04635139, 0.05261509, 0.07422487,
       0.06482931, 0.08001879, 0.07720012], dtype=float32)>

In [41]:
# Round-trip check: undo the scaling of the slice shown above. The recorded
# output is in float32 and shows small rounding errors (e.g. 510.99997).
minmax_layer(r[89,:,0], reverse = True)

<tf.Tensor: shape=(18,), dtype=float32, numpy=
array([ 826.     ,  649.     , 4454.     , 2096.     , 1170.     ,
       1154.     , 1062.     , 1192.     , 1146.     , 1229.     ,
       1067.     , 1174.     ,  296.     ,  336.     ,  474.     ,
        414.     ,  510.99997,  492.99997], dtype=float32)>

In [42]:
# Data minimum found by adapt().
minmax_layer.min_x

0.0

In [43]:
# Data maximum found by adapt().
minmax_layer.max_x

6386.0

In [15]:
class TimeSpaceEmbedding(tf.keras.layers.Layer):
    """Tile time and spatial embeddings onto a shared (window, station) grid.

    Input: a pair ``[time_embeddings, spatial_embeddings]`` of rank-3 tensors
    shaped ``(batch, window_size, time_dim)`` and
    ``(batch, num_station, space_dim)``.

    Output: a tensor of shape
    ``(batch, window_size, num_station, time_dim + space_dim)`` in which the
    time embedding of a step is repeated for every station, the spatial
    embedding of a station is repeated for every step, and the two are
    concatenated on the last axis (time channels first).

    The layer has no trainable weights. ``window_size``, ``num_station`` and
    both embedding widths must be statically known when the layer is built.
    """

    def __init__(self, *args):
        # Positional arguments are accepted but ignored.
        super().__init__()

        self.concat = tf.keras.layers.Concatenate(axis = 3)
        # Not used by call(); kept so the layer's attributes stay unchanged.
        self.add = tf.keras.layers.Add()

    def build(self, input_shape):
        """Create the two reshaping pipelines from the static input shapes."""
        time_shape, space_shape = input_shape[0], input_shape[1]
        window_size = time_shape[1]
        num_station = space_shape[1]
        # Embedding widths are read from the inputs rather than hard-coded
        # (they are 8 and 2 for the data used in this notebook).
        time_dim = time_shape[-1]
        space_dim = space_shape[-1]

        # (batch, window, time_dim) -> flatten -> repeat once per station
        # -> (batch, station, window, time_dim) -> (batch, window, station, time_dim)
        self.t_embedding = tf.keras.Sequential([
            tf.keras.layers.Flatten(),
            tf.keras.layers.RepeatVector(num_station),
            tf.keras.layers.Reshape(target_shape = (num_station, window_size, time_dim)),
            tf.keras.layers.Permute(dims = (2,1,3))
        ])

        # (batch, station, space_dim) -> flatten -> repeat once per window step
        # -> (batch, window, station, space_dim)
        self.s_embedding = tf.keras.Sequential([
            tf.keras.layers.Flatten(),
            tf.keras.layers.RepeatVector(window_size),
            tf.keras.layers.Reshape(target_shape = (window_size, num_station, space_dim)),
        ])

    def call(self, inputs):
        """Return the concatenated time/space embedding grid for ``inputs``."""
        time_embeddings, spatial_embeddings = inputs

        time_embeddings = self.t_embedding(time_embeddings)
        spatial_embeddings = self.s_embedding(spatial_embeddings)
        embeddings = self.concat([time_embeddings, spatial_embeddings])
        return embeddings

In [16]:
# Encoder side: all time-embedding rows except the last ([:, :-1]) for the
# first 100 samples. Recorded output shape: (100, 18, 147, 10).
layer_ts_1 = TimeSpaceEmbedding()
out_1 = layer_ts_1([train_data['time_embeddings'][:100,:-1],train_data['spatial_embeddings'][:100]])
print('Encoder Time Space Embedding shape: {}'.format(out_1.shape))

# Decoder side: only the last time-embedding row ([:, -1:]), presumably the
# step being predicted. Recorded output shape: (100, 1, 147, 10).
layer_ts_2 = TimeSpaceEmbedding()
out_2 = layer_ts_2([train_data['time_embeddings'][:100,-1:],train_data['spatial_embeddings'][:100]])
print('Decoder Time Space Embedding shape: {}'.format(out_2.shape))

Encoder Time Space Embedding shape: (100, 18, 147, 10)
Decoder Time Space Embedding shape: (100, 1, 147, 10)


In [17]:
# Check that, for one sample and one time step, every station carries the same
# time embedding (the first 8 channels). All stations are compared against
# station 0 via broadcasting; the previous version compared the slice with
# itself and was therefore always True.
tf.math.reduce_all(out_1[0,0,:,:8] == out_1[0,0,:1,:8])

<tf.Tensor: shape=(), dtype=bool, numpy=True>

In [18]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Project normalized values to ``d_model`` channels and add time/space embeddings.

    Args:
        normalizer: Callable (e.g. a Keras layer) applied to the raw values
            before they are projected.
        d_model: Width of the Dense projection. The projection is summed with
            the TimeSpaceEmbedding output, so the two widths have to match
            (10 channels for the data used in this notebook).
    """

    def __init__(self, normalizer, d_model = 10):
        super().__init__()
        self.norm = normalizer
        self.d_model = d_model
        self.embedding = tf.keras.layers.Dense(d_model)
        self.ts_embedding = TimeSpaceEmbedding()
        self.add = tf.keras.layers.Add()

    def call(self, inputs):
        """Embed ``inputs = [values, time_embeddings, spatial_embeddings]``.

        In this notebook ``values`` is (batch, 18, 147) and the result is
        (batch, 18, 147, d_model).
        """
        values, time_embeddings, spatial_embeddings = inputs

        # Normalize, then append a channel axis so Dense maps 1 -> d_model.
        normalized = self.norm(values)
        normalized = normalized[:, :, :, tf.newaxis]
        projected = self.embedding(normalized)

        # Scale the projection by sqrt(d_model) before adding the embeddings.
        projected = projected * tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        grid = self.ts_embedding([time_embeddings, spatial_embeddings])
        return self.add([projected, grid])


In [19]:
# Global (axis=None) normalization with a fixed mean and variance.
# NOTE(review): the constants are hard-coded; presumably they were computed
# from the training features - TODO confirm. The next cell redefines `norm`
# with the same arguments.
norm = tf.keras.layers.Normalization(axis = None, mean = 188.4318877714359, variance = 120971.63484231419)
# unnorm = tf.keras.layers.Normalization(axis = None, mean = 188.4318877714359, variance = 120971.63484231419, invert = True)
# norm.adapt(train_data['features'][:100])

# unnorm = tf.keras.layers.Normalization(axis = None, invert = True)
# unnorm.adapt(train_data['features'][:100])

In [20]:
# Same fixed-statistics normalizer as in the previous cell (duplicate definition).
norm = tf.keras.layers.Normalization(axis = None, mean = 188.4318877714359, variance = 120971.63484231419) # Normalize All
# norm = tf.keras.layers.Normalization(axis = -1) #Normalize station by station
# norm.adapt(train_data['features'][:100])

# Embed the first 100 samples: features plus the time embeddings without their
# last row and the spatial embeddings. `out_3` is reused by later cells.
layer_pos = PositionalEmbedding(norm,
                                d_model = 10) 
out_3 = layer_pos([train_data['features'][:100], 
                   train_data['time_embeddings'][:100,:-1],
                   train_data['spatial_embeddings'][:100]
                  ])

# Recorded output shape: (100, 18, 147, 10).
print('Model Input Shape + Embeddings + Positional Embeddigns: {}'.format(out_3.shape))

Model Input Shape + Embeddings + Positional Embeddigns: (100, 18, 147, 10)


In [21]:
# Largest value in the embedded input (value-range check).
tf.math.reduce_max(out_3)

<tf.Tensor: shape=(), dtype=float32, numpy=28.405294>

In [22]:
# Smallest value in the embedded input (value-range check).
tf.math.reduce_min(out_3)

<tf.Tensor: shape=(), dtype=float32, numpy=-32.153503>

In [23]:
# Variables tracked by the layer; the recorded output lists only the Dense
# projection's kernel (1, 10) and bias (10,).
layer_pos.weights

[<tf.Variable 'positional_embedding/dense/kernel:0' shape=(1, 10) dtype=float32, numpy=
 array([[-0.1638875 ,  0.66872746, -0.03148836,  0.07698011,  0.09612834,
         -0.09473622, -0.06342274, -0.16402102, -0.71561295,  0.5829312 ]],
       dtype=float32)>,
 <tf.Variable 'positional_embedding/dense/bias:0' shape=(10,) dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>]

In [24]:
class BaseAttention(tf.keras.layers.Layer):
    """Shared building blocks for the attention layers defined in this notebook.

    Every keyword argument is forwarded unchanged to
    ``tf.keras.layers.MultiHeadAttention``. Subclasses provide ``call`` and
    combine ``mha`` with the residual ``add`` and ``layernorm`` created here.
    """

    def __init__(self, **kwargs):
        super().__init__()
        keras_layers = tf.keras.layers
        self.mha = keras_layers.MultiHeadAttention(**kwargs)
        self.layernorm = keras_layers.LayerNormalization()
        self.add = keras_layers.Add()

class CrossAttention(BaseAttention):
    """Attention of a query over a separate context, with residual + layer norm."""

    def call(self, x, context):
        """Attend from ``x`` (query) to ``context`` (key and value).

        The attention scores of the most recent call are stored on
        ``self.last_attn_scores`` so they can be inspected or plotted later.
        """
        attended, scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)
        self.last_attn_scores = scores

        # Residual connection followed by layer normalization.
        residual = self.add([x, attended])
        return self.layernorm(residual)

class GlobalSelfAttention(BaseAttention):
    """Unmasked self-attention with residual connection and layer norm."""

    def call(self, x):
        """Attend from ``x`` to itself.

        The attention scores of the most recent call are stored on
        ``self.last_attn_scores`` so they can be inspected or plotted later.
        """
        attended, scores = self.mha(
            query=x,
            key=x,
            value=x,
            return_attention_scores=True)
        self.last_attn_scores = scores

        # Residual connection followed by layer normalization.
        residual = self.add([x, attended])
        return self.layernorm(residual)

class CausalSelfAttention(BaseAttention):
    """Self-attention with a causal mask, residual connection and layer norm.

    Unlike the other attention layers here, the scores are not cached.
    """

    def call(self, x):
        """Attend from ``x`` to itself with ``use_causal_mask`` enabled."""
        attended = self.mha(
            query=x,
            key=x,
            value=x,
            use_causal_mask = True)

        # Residual connection followed by layer normalization.
        residual = self.add([x, attended])
        return self.layernorm(residual)

class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward block with dropout, residual add and layer norm.

    Args:
        d_model: Output width of the block (second Dense layer).
        dff: Hidden width (first Dense layer, ReLU activation).
        dropout_rate: Dropout rate applied after the second Dense layer.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        keras_layers = tf.keras.layers
        self.seq = tf.keras.Sequential([
            keras_layers.Dense(dff, activation='relu'),
            keras_layers.Dense(d_model),
            keras_layers.Dropout(dropout_rate),
        ])
        self.add = keras_layers.Add()
        self.layer_norm = keras_layers.LayerNormalization()

    def call(self, x):
        """Return ``layer_norm(x + seq(x))``."""
        transformed = self.seq(x)
        residual = self.add([x, transformed])
        return self.layer_norm(residual)

class EncoderLayer(tf.keras.layers.Layer):
    """Encoder block: temporal self-attention, spatial self-attention, feed-forward.

    The input is expected to be rank 4; in this notebook it is
    (batch, window, stations, d_model). Attention is first applied along
    axis 1 and then along axis 2.

    Args:
        d_model: Channel width of the input/output; also used as ``key_dim``
            of the spatial attention.
        num_heads: Number of heads for both attention layers.
        key_dim: ``key_dim`` of the temporal attention.
        dff: Hidden width of the feed-forward block.
        dropout_rate: Dropout rate used by both attention layers and by the
            feed-forward block.
    """

    def __init__(self,*, d_model, num_heads, key_dim, dff, dropout_rate=0.1):
        super().__init__()

        self.self_attention_temporal = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=key_dim,
            dropout=dropout_rate,
            attention_axes = (1,)
            )

        # NOTE(review): the spatial attention uses d_model as key_dim while the
        # temporal one uses the `key_dim` argument - confirm this is intended.
        self.self_attention_spatial = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate,
            attention_axes = (2,))

        # Forward dropout_rate so it is honored by the whole block; previously
        # the feed-forward part always fell back to its own default of 0.1.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x):
        """Apply temporal attention, spatial attention and the feed-forward block."""
        x = self.self_attention_temporal(x)
        x = self.self_attention_spatial(x)
        x = self.ffn(x)
        return x

In [25]:
# Smoke-test a single EncoderLayer on the embedded inputs (`out_3`) and look at
# the attention scores cached by its two self-attention sub-layers.
layer_encoder = EncoderLayer(d_model = 10, num_heads = 1, key_dim = 64, dff = 2048)
encoder_out = layer_encoder(out_3)
attention_scores_temporal = layer_encoder.self_attention_temporal.last_attn_scores
attention_scores_spatial = layer_encoder.self_attention_spatial.last_attn_scores

max_att_temporal = tf.math.reduce_max(attention_scores_temporal)
max_att_spatial = tf.math.reduce_max(attention_scores_spatial)
print( "Output Context shape: {}".format(encoder_out.shape))
print( "Temportal Attention Scores shape: {}".format(attention_scores_temporal.shape))
print( "Spatial Attention Scores shape: {}".format(attention_scores_spatial.shape))
print('Max Attention Temporal: {}'.format(max_att_temporal))
# Label fixed: this line reports the spatial maximum (it used to print
# "Temporal" for both values).
print('Max Attention Spatial: {}'.format(max_att_spatial))


Output Context shape: (100, 18, 147, 10)
Temportal Attention Scores shape: (100, 147, 1, 18, 18)
Spatial Attention Scores shape: (100, 18, 1, 147, 147)
Max Attention Temporal: 0.1889762282371521
Max Attention Temporal: 0.012486708350479603


In [26]:
class Encoder(tf.keras.layers.Layer):
    """Embedding, dropout and a stack of ``num_layers`` EncoderLayer blocks.

    Args:
        normalizer: Passed to PositionalEmbedding to normalize the raw values.
        num_layers: Number of EncoderLayer blocks.
        d_model, num_heads, key_dim, dff: Forwarded to every EncoderLayer.
        dropout_rate: Used for the dropout after the embedding and forwarded
            to every EncoderLayer.
    """

    def __init__(self, *, normalizer, num_layers, d_model, num_heads,
               key_dim, dff,  dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(normalizer, d_model=d_model)

        layer_config = dict(
            d_model=d_model,
            num_heads=num_heads,
            key_dim=key_dim,
            dff=dff,
            dropout_rate=dropout_rate,
        )
        self.enc_layers = [EncoderLayer(**layer_config) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        """Encode ``x = [values, time_embeddings, spatial_embeddings]``.

        In this notebook the result is (batch, 18, 147, d_model).
        """
        # Values are projected and combined with the time/space embeddings.
        hidden = self.pos_embedding(x)
        hidden = self.dropout(hidden)

        for block in self.enc_layers:
            hidden = block(hidden)

        return hidden

In [27]:
# Run a 2-layer Encoder on the first 100 samples; `out_encoder` is used later
# as the decoder's context. Recorded output shape: (100, 18, 147, 10).
encoder = Encoder(normalizer = norm, num_layers = 2, d_model = 10, num_heads = 1, key_dim = 64, dff = 512)
out_encoder = encoder([train_data['features'][:100], 
                       train_data['time_embeddings'][:100,:-1],
                       train_data['spatial_embeddings'][:100]
                      ])
print("Encoder output shape: {}".format(out_encoder.shape))

Encoder output shape: (100, 18, 147, 10)


In [28]:
# out_encoder[0]

In [29]:
# encoder.enc_layers[0].self_attention_temporal.last_attn_scores

In [30]:
class DecoderLayer(tf.keras.layers.Layer):
    """Decoder block: cross-attention over the context, then feed-forward.

    The cross-attention runs jointly over axes 1 and 2 of its inputs. The
    causal self-attention step is currently disabled (kept commented out).

    Args:
        d_model: Channel width of the input/output.
        num_heads: Number of heads of the cross-attention.
        key_dim: ``key_dim`` of the cross-attention.
        dff: Hidden width of the feed-forward block.
        dropout_rate: Dropout rate used by the cross-attention and by the
            feed-forward block.
    """

    def __init__(self,
               *,
               d_model,
               num_heads,
               key_dim,
               dff,
               dropout_rate=0.1):
        super(DecoderLayer, self).__init__()

        # self.causal_self_attention = CausalSelfAttention(
        #     num_heads=num_heads,
        #     key_dim=d_model,
        #     dropout=dropout_rate)

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=key_dim,
            dropout=dropout_rate,
            attention_axes = (1,2),
            kernel_initializer=tf.keras.initializers.GlorotNormal(),
            bias_initializer = 'zeros')

        # Forward dropout_rate so it is honored by the whole block; previously
        # the feed-forward part always fell back to its own default of 0.1.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x, context):
        """Attend from ``x`` to ``context`` and apply the feed-forward block.

        The cross-attention scores of the most recent call are exposed as
        ``self.last_attn_scores``.
        """
#         x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Cache the last attention scores for plotting later
        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.ffn(x)  # Same shape as the incoming `x`.
        return x

In [31]:
# Rebuild the decoder-side time/space embedding from the last time-embedding
# row ([:, -1:]). This repeats the `layer_ts_2` / `out_2` code of an earlier cell.
layer_ts_2 = TimeSpaceEmbedding()
out_2 = layer_ts_2([train_data['time_embeddings'][:100,-1:],train_data['spatial_embeddings'][:100]])
print('Decoder Time Space Embedding shape: {}'.format(out_2.shape))

Decoder Time Space Embedding shape: (100, 1, 147, 10)


In [36]:
# Smoke-test a single DecoderLayer. The query is the decoder embedding `out_2`.
# NOTE(review): the context passed here is `out_3` (the PositionalEmbedding
# output), not the encoder output `out_encoder` - confirm this is intended.
decoder_layer = DecoderLayer(d_model = 10, num_heads = 1, key_dim = 64, dff = 512)
decoder_out = decoder_layer(out_2, out_3)
# Recorded shapes: output (100, 1, 147, 10), scores (100, 1, 1, 147, 18, 147).
print ("Decoder layer Output shape: {}".format(decoder_out.shape))
print ("Attention Weights shapes: {}".format(decoder_layer.last_attn_scores.shape))
# decoder_layer.cross_attention.weights

Decoder layer Output shape: (100, 1, 147, 10)
Attention Weights shapes: (100, 1, 1, 147, 18, 147)


In [37]:
# Decoder embedding of the first sample at its single time step: one row per
# station. In the recorded output the leading (time) channels are identical
# across rows and only the last two (spatial) channels differ.
out_2[0,0]

<tf.Tensor: shape=(147, 10), dtype=float32, numpy=
array([[-0.6763811 , -0.7365518 ,  0.93087375, ...,  1.        ,
        -4.4002867 , -4.845774  ],
       [-0.6763811 , -0.7365518 ,  0.93087375, ...,  1.        ,
        -4.7893205 , -4.9505243 ],
       [-0.6763811 , -0.7365518 ,  0.93087375, ...,  1.        ,
        -4.081254  , -4.777915  ],
       ...,
       [-0.6763811 , -0.7365518 ,  0.93087375, ...,  1.        ,
         2.0638065 , -3.1863048 ],
       [-0.6763811 , -0.7365518 ,  0.93087375, ...,  1.        ,
         2.0039616 , -3.4523752 ],
       [-0.6763811 , -0.7365518 ,  0.93087375, ...,  1.        ,
         2.2462914 ,  3.5132635 ]], dtype=float32)>

In [38]:
# Variability of the decoder-layer output: standard deviation across stations
# for each output channel (sample 88, the single target step).
# Target STD > 0.03
# The channel count is read from the tensor instead of being hard-coded.
for i in range(decoder_out.shape[-1]):
    tf.print(tf.math.reduce_std(decoder_out[88,0,:,i]))

0.558229744
0.258420646
0.164463669
0.373072
0.390261889
0.192642495
0.292528063
0.306967586
1.4892633
1.29142845


In [39]:
# Cross-attention scores of sample 5 for the first head / target step /
# target station: all 18 context steps against the first 5 context stations.
decoder_layer.last_attn_scores[5,0,0,0,:,:5]

<tf.Tensor: shape=(18, 5), dtype=float32, numpy=
array([[0.00038065, 0.00035971, 0.00035763, 0.00035702, 0.00035854],
       [0.00042175, 0.00036087, 0.00035878, 0.00035816, 0.00035833],
       [0.00060641, 0.0003719 , 0.00039881, 0.00036911, 0.00037456],
       [0.00058758, 0.00037266, 0.00039662, 0.00037462, 0.0003755 ],
       [0.00065847, 0.00035173, 0.00040206, 0.00035795, 0.0003686 ],
       [0.00062614, 0.00035116, 0.00039296, 0.00035788, 0.00036645],
       [0.00059631, 0.00035063, 0.0003841 , 0.00035785, 0.0003678 ],
       [0.000608  , 0.00035014, 0.00038849, 0.00035955, 0.00037078],
       [0.00068119, 0.00034969, 0.00038726, 0.00036233, 0.00037083],
       [0.00065637, 0.00034928, 0.00039308, 0.00036432, 0.00036882],
       [0.00062836, 0.00034892, 0.00038349, 0.00036342, 0.00037459],
       [0.00073178, 0.0003486 , 0.00039623, 0.00036602, 0.00037674],
       [0.00038287, 0.00035706, 0.00035904, 0.0003564 , 0.00035471],
       [0.00036044, 0.00036154, 0.00035944, 0.00035882

In [27]:
class Decoder(tf.keras.layers.Layer):
    """Time/space embedding followed by a stack of ``num_layers`` DecoderLayer blocks.

    Args:
        num_layers: Number of DecoderLayer blocks.
        d_model, num_heads, key_dim, dff: Forwarded to every DecoderLayer.
        dropout_rate: Forwarded to every DecoderLayer and used to create
            ``self.dropout``.

    NOTE(review): ``self.dropout`` is created but never applied in ``call``
    (the Encoder does apply its dropout after the embedding) - confirm.
    """

    def __init__(self, *, num_layers, d_model, num_heads, key_dim, 
                 dff, dropout_rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.ts_embeddings = TimeSpaceEmbedding()

        layer_config = dict(
            d_model=d_model,
            num_heads=num_heads,
            key_dim=key_dim,
            dff=dff,
            dropout_rate=dropout_rate,
        )
        self.dec_layers = [DecoderLayer(**layer_config) for _ in range(num_layers)]

        # Filled in by call() with the scores of the last decoder block.
        self.last_attn_scores = None

    def call(self, time_info, space_info, context):
        """Decode the target step(s) described by ``time_info``/``space_info``.

        The query is built only from the time and spatial embeddings; each
        block then attends to ``context``. In this notebook the result is
        (batch, 1, 147, d_model).
        """
        hidden = self.ts_embeddings([time_info, space_info])

        for block in self.dec_layers:
            hidden = block(hidden, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        return hidden


In [35]:
# Run a 1-layer Decoder for the first 100 samples: the query comes from the
# last time-embedding row and the spatial embeddings, the context is the
# encoder output computed earlier.
decoder = Decoder(num_layers = 1, 
                  d_model = 10, 
                  num_heads = 1, 
                  key_dim = 64, 
                  dff = 256)
decoder_output = decoder(
    time_info = train_data['time_embeddings'][:100,-1:],
    space_info = train_data['spatial_embeddings'][:100],
    context = out_encoder,
    )

In [39]:
# Decoder output for the first sample; recorded shape (1, 147, 10).
decoder_output[0]

<tf.Tensor: shape=(1, 147, 10), dtype=float32, numpy=
array([[[-1.3587011 , -1.2561235 ,  0.7062563 , ...,  1.2659124 ,
          0.30370155, -0.71928406],
        [-1.4011933 , -1.7639025 ,  1.3213884 , ...,  1.0465951 ,
          0.38347372, -0.6001412 ],
        [-1.5314049 , -1.5713661 ,  1.0516503 , ...,  1.1190933 ,
          0.33989525, -0.5602668 ],
        ...,
        [-1.4828482 , -1.6350299 ,  1.142728  , ...,  1.133483  ,
          0.34282243, -0.656931  ],
        [-1.3851902 , -1.2816969 ,  0.80154413, ...,  1.2626746 ,
          0.29927102, -0.666968  ],
        [-1.3979375 , -1.7624282 ,  1.3258314 , ...,  1.0478123 ,
          0.36454266, -0.6042946 ]]], dtype=float32)>

In [34]:
# Embedded encoder input of sample 40 at the first window step; recorded
# shape (147, 10), one row per station.
out_3[40,0]

<tf.Tensor: shape=(147, 10), dtype=float32, numpy=
array([[ 0.58125377,  0.17522734,  0.7806701 , ...,  2.437379  ,
         1.0234131 , -2.9449162 ],
       [-1.0602976 , -1.4350901 ,  0.70752794, ...,  0.22954649,
         0.18265942,  0.4411723 ],
       [-1.0602976 , -1.4350901 ,  0.70752794, ...,  0.22954649,
         0.18097934,  0.4411443 ],
       ...,
       [-1.0602976 , -1.4350901 ,  0.70752794, ...,  0.22954649,
         0.16639832,  0.44088632],
       [ 0.7037004 ,  0.2953443 ,  0.78612596, ...,  2.6020658 ,
         1.0709995 , -3.1977165 ],
       [-1.0602976 , -1.4350901 ,  0.70752794, ...,  0.22954649,
         0.16596532,  0.43980032]], dtype=float32)>

In [143]:
# NOTE(review): `f` is only defined in the following cell (Dense(1)), so this
# cell fails on a fresh kernel run top to bottom; it relies on kernel state
# from out-of-order execution (see the execution counts).
f.weights

[<tf.Variable 'dense_29/kernel:0' shape=(10, 1) dtype=float32, numpy=
 array([[ 0.47699338],
        [-0.3378294 ],
        [ 0.52709764],
        [ 0.46894234],
        [ 0.2024687 ],
        [-0.54486436],
        [-0.46729538],
        [-0.23314953],
        [ 0.13476223],
        [ 0.66441625]], dtype=float32)>,
 <tf.Variable 'dense_29/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

In [40]:
# Goal: the prediction should vary across stations and samples.
# Project the 10 decoder channels to a single value per station with an
# untrained Dense(1); recorded output shape: (100, 1, 147, 1).
f = tf.keras.layers.Dense(1)
final_results = f(decoder_output)
final_results#[0]

<tf.Tensor: shape=(100, 1, 147, 1), dtype=float32, numpy=
array([[[[-1.7301033],
         [-1.8509916],
         [-1.9397391],
         ...,
         [-1.8448691],
         [-1.7311057],
         [-1.8390683]]],


       [[[-1.6963948],
         [-1.832441 ],
         [-1.8977286],
         ...,
         [-1.8330915],
         [-1.7028039],
         [-1.8207558]]],


       [[[-1.6562171],
         [-1.8077337],
         [-1.8491088],
         ...,
         [-1.8153813],
         [-1.6776092],
         [-1.7959186]]],


       ...,


       [[[-1.708894 ],
         [-2.2060375],
         [-2.115303 ],
         ...,
         [-2.0606067],
         [-2.0396852],
         [-2.1928253]]],


       [[[-1.6821105],
         [-2.1748266],
         [-2.0945687],
         ...,
         [-2.052512 ],
         [-2.0231206],
         [-2.1615899]]],


       [[[-1.6524347],
         [-2.1427932],
         [-2.0745325],
         ...,
         [-2.0447097],
         [-1.9863095],
         [-2.129510

In [145]:
# NOTE(review): tf.nn.softmax is called without an axis argument. With
# `final_results` as produced by the Dense(1) cell above (shape
# (100, 1, 147, 1)) the last axis has size 1. The recorded outputs of the next
# cells show a trailing dimension of 9000, so they apparently come from a
# different `final_results` - re-run from a fresh kernel to confirm.
a = tf.nn.softmax(final_results[0])

In [114]:
# Sum of one softmax row; the recorded result is 1.0.
tf.math.reduce_sum(a[0,0,:])

<tf.Tensor: shape=(), dtype=float32, numpy=1.0>

In [119]:
# First two rows of the softmax output; the recorded shape is (2, 9000), which
# does not match the `final_results` shape shown earlier in this notebook.
a[0,:2,:]

<tf.Tensor: shape=(2, 9000), dtype=float32, numpy=
array([[0.00011923, 0.00011405, 0.00011329, ..., 0.00011233, 0.00010616,
        0.00011123],
       [0.00011774, 0.0001128 , 0.00011329, ..., 0.00011146, 0.00010581,
        0.00011192]], dtype=float32)>